diff --git a/cmd/router/main.go b/cmd/router/main.go index 58727d8f..0eb50639 100644 --- a/cmd/router/main.go +++ b/cmd/router/main.go @@ -44,6 +44,13 @@ func main() { mtlsCert = flag.String("mtls-cert", "", "Path to mTLS client certificate for upstream WorkloadManager connections") mtlsKey = flag.String("mtls-key", "", "Path to mTLS client key for upstream WorkloadManager connections") mtlsCA = flag.String("mtls-ca", "", "Path to mTLS CA bundle for verifying upstream WorkloadManager identity") + + // OIDC configuration for external user authentication. + // External auth is automatically enabled when --oidc-issuer-url is provided. + oidcIssuerURL = flag.String("oidc-issuer-url", "", "OIDC provider issuer URL, enables external auth when set") + oidcAudience = flag.String("oidc-audience", "agentcube-api", "Expected audience (aud) claim in the access token") + oidcRolesClaim = flag.String("oidc-roles-claim", "", "JSON path to roles array in the JWT (e.g., realm_access.roles)") + oidcRequiredRole = flag.String("oidc-required-role", "", "Role required to access the API (e.g., sandbox:invoke)") ) // Initialize klog flags @@ -76,6 +83,10 @@ func main() { TLSKey: *tlsKey, MaxConcurrentRequests: *maxConcurrentRequests, MTLSConfig: tlsConfig, + OIDCIssuerURL: *oidcIssuerURL, + OIDCAudience: *oidcAudience, + OIDCRolesClaim: *oidcRolesClaim, + OIDCRequiredRole: *oidcRequiredRole, } // Create Router API server diff --git a/docs/agentcube/docs/tutorials/external-auth-keycloak.md b/docs/agentcube/docs/tutorials/external-auth-keycloak.md new file mode 100644 index 00000000..00028124 --- /dev/null +++ b/docs/agentcube/docs/tutorials/external-auth-keycloak.md @@ -0,0 +1,441 @@ +# Securing External Access with Keycloak (OIDC) + +This task shows you how to add external authentication and authorization to +AgentCube using [Keycloak](https://www.keycloak.org/) as the identity provider. +By the end, every request to the Router API will require a valid JWT token +issued by Keycloak, and access will be controlled by realm roles (RBAC) and +resource ownership (RLAC). + +## Before you begin + +1. Follow the [Getting Started](../getting-started.md) guide to install + AgentCube on your cluster and deploy the `sample-agent` runtime. **Do not** + enable OIDC during the initial installation - this tutorial walks through + that step explicitly. + +2. Make sure you have the following tools installed: + - [`kubectl`](https://kubernetes.io/docs/tasks/tools/) (v1.25+) + - [`helm`](https://helm.sh/docs/intro/install/) (v3.12+) + - [`curl`](https://curl.se/) (any recent version) + - [`jq`](https://jqlang.github.io/jq/download/) (for parsing JSON + responses) + +3. Confirm AgentCube is running without external auth: + + ```bash + kubectl get pods -n agentcube-system + ``` + + You should see the Router and WorkloadManager pods in `Running` state: + +``` + NAME READY STATUS RESTARTS AGE + agentcube-router-699fb7784d-6plc9 2/2 Running 0 15s + spire-agent-jg7mm 1/1 Running 3 (31m ago) 31m + spire-server-0 2/2 Running 0 31m + workloadmanager-57c547f945-zqwf6 2/2 Running 3 (30m ago) 31m +``` + +4. Confirm the Router is currently reachable **without** a token. + + Open a **new terminal** and port-forward the Router: + + ```bash + kubectl port-forward svc/agentcube-router -n agentcube-system 8081:8080 + ``` + + In your **original terminal**, check the health endpoint: + + ```bash + curl -s http://localhost:8081/health/live + ``` + + Expected output: + + ```json + {"status":"alive"} + ``` + + Stop the port-forward by pressing `Ctrl+C` in the terminal where it is + running. + +## What gets deployed + +When you install the Keycloak addon and enable OIDC, the following resources are +created or modified: + +| Resource | Kind | Purpose | +|---|---|---| +| `keycloak` | Deployment (1 replica) | Keycloak identity provider. Imports the `agentcube` realm on first startup. | +| `keycloak` | Service (ClusterIP) | Exposes Keycloak on port 8080 inside the cluster. | +| `keycloak-realm-config` | Secret | Contains the realm JSON with roles, clients, and scope mappings. | +| `keycloak-credentials` | Secret | Stores admin password and client secrets. | + +The Keycloak addon also auto-configures the following inside the `agentcube` +realm: + +| Item | Details | +|---|---| +| **Roles** | `sandbox:invoke` (default for all users), `sandbox:manage` (inherits invoke), `admin` (inherits manage) | +| **Clients** | `agentcube-service` (confidential, for SDKs), `agentcube-sdk` (public, for browsers/CLI), `agentcube-admin` (confidential, for admin ops) | +| **Audience** | All tokens include `agentcube-api` in the `aud` claim via a protocol mapper | + +After enabling OIDC on the Router, **all** requests to `/v1/...` endpoints +require a valid Bearer token. Health endpoints (`/health/live`, `/health/ready`) +remain unauthenticated for Kubernetes probes. + +## Step 1 - Deploy the Keycloak addon + +Install Keycloak into your AgentCube namespace using the addon Helm chart. You +must provide an admin username, admin password, and client secrets for the three +confidential clients. Choose strong secrets for production - the values below +are for demonstration only. + +```bash +helm upgrade --install keycloak manifests/charts/addons/keycloak \ + --namespace agentcube-system \ + --set adminUser=admin \ + --set adminPassword=admin \ + --set clients.service.secret=my-service-secret \ + --set clients.router.secret=my-router-secret \ + --set clients.admin.secret=my-admin-secret +``` + +Expected output: + +``` + Release "keycloak" does not exist. Installing it now. + NAME: keycloak + LAST DEPLOYED: Sat Jun 6 03:03:47 2026 + NAMESPACE: agentcube-system + STATUS: deployed + REVISION: 1 + DESCRIPTION: Install complete + TEST SUITE: None +``` + +> **Tip:** +> The addon chart uses `devMode: true` by default, which runs Keycloak with +> an embedded H2 database. This is fine for local development and testing. +> For production deployments, set `devMode=false` and provide an external +> database - see the chart's `values.yaml` for the full set of production +> options. + +Wait for Keycloak to be ready. The JVM startup plus realm import typically +takes 60–90 seconds: + +```bash +kubectl rollout status deployment/keycloak -n agentcube-system --timeout=300s +``` + +Expected output: + +``` +Waiting for deployment "keycloak" rollout to finish: 0 of 1 updated replicas are available... +deployment "keycloak" successfully rolled out +``` + +Verify the Keycloak pod is running: + +```bash +kubectl get pods -n agentcube-system -l app=keycloak +``` + +Expected output: + +``` +NAME READY STATUS RESTARTS AGE +keycloak-f8d7dff7b-b2p24 1/1 Running 0 2m44s +``` + +## Step 2 - Enable OIDC on the Router + +Now configure the Router to validate tokens against Keycloak. This is done by +upgrading the base AgentCube Helm release with OIDC settings. Use +`--reuse-values` so your existing configuration (Redis, images, SPIRE, etc.) is +preserved. + +```bash +helm upgrade agentcube manifests/charts/base \ + -n agentcube-system \ + --reuse-values \ + --set router.oidc.issuerUrl=http://keycloak.agentcube-system.svc.cluster.local:8080/realms/agentcube \ + --set router.oidc.rolesClaim=realm_access.roles \ + --set router.oidc.requiredRole=sandbox:invoke +``` + +Expected output: + +``` +Release "agentcube" has been upgraded. Happy Helming! +NAME: agentcube +LAST DEPLOYED: Sat Jun 6 03:07:07 2026 +NAMESPACE: agentcube-system +STATUS: deployed +REVISION: 4 +DESCRIPTION: Upgrade complete +TEST SUITE: None +``` + +This tells the Router: + +- **`issuerUrl`** - where to discover OIDC configuration and signing keys + (JWKS). The Router fetches these once at startup and caches them. +- **`rolesClaim`** - the dot-separated path inside the JWT where roles are + stored. For Keycloak this is `realm_access.roles`. +- **`requiredRole`** - the minimum role required to access the API. Users + without `sandbox:invoke` will receive a `403 Forbidden`. + +Wait for the Router to restart with the new configuration: + +```bash +kubectl rollout status deployment/agentcube-router -n agentcube-system --timeout=120s +``` + +## Step 3 - Verify that unauthenticated requests are rejected + +Open a **new terminal** and port-forward the Router: + +```bash +kubectl port-forward svc/agentcube-router -n agentcube-system 8081:8080 +``` + +In your **original terminal**, try making a request to the `sample-agent` (which you deployed in the Getting Started guide) **without** a token: + +```bash +curl -s -w "\nHTTP Status: %{http_code}\n" http://localhost:8081/v1/namespaces/default/agent-runtimes/sample-agent/invocations/ +``` + +Expected output — the Router now rejects unauthenticated requests: + +``` +{"code":"UNAUTHORIZED","error":"missing authorization header"} +HTTP Status: 401 +``` + +## Step 4 - Obtain a token from Keycloak + +Open **another new terminal** and port-forward Keycloak so you can reach it +from your local machine: + +```bash +kubectl port-forward svc/keycloak -n agentcube-system 8082:8080 +``` + +Back in your **original terminal**, proceed with the token requests below. + +### Get a service account token + +Use the `client_credentials` grant to obtain a token for the +`agentcube-service` client. Note the `Host` header - this is needed because +Keycloak validates the issuer against the original hostname, and we're +connecting through a port-forward: + +```bash +KEYCLOAK_TOKEN=$(curl -s -X POST \ + -H "Host: keycloak.agentcube-system.svc.cluster.local:8080" \ + "http://localhost:8082/realms/agentcube/protocol/openid-connect/token" \ + -d "grant_type=client_credentials" \ + -d "client_id=agentcube-service" \ + -d "client_secret=my-service-secret" | jq -r '.access_token') + +echo $KEYCLOAK_TOKEN +``` + +Expected output - a long base64-encoded JWT string: + +### Inspect the token (optional) + +You can decode the token to see its claims: + +```bash +echo $KEYCLOAK_TOKEN | cut -d'.' -f2 | base64 -d 2>/dev/null | jq . +``` + +Expected output - you should see `realm_access.roles` containing +`sandbox:invoke`, and `aud` containing `agentcube-api`: + +``` +{ + "exp": 1780695969, + "iat": 1780695669, + "jti": "31ef712f-f62a-4a4e-8714-7ccf710ab476", + "iss": "http://keycloak.agentcube-system.svc.cluster.local:8080/realms/agentcube", + "aud": "agentcube-api", + "sub": "bb0f4c04-e1d2-4bdb-b7bb-56d5c1cefe50", + "typ": "Bearer", + "azp": "agentcube-service", + "acr": "1", + "realm_access": { + "roles": [ + "sandbox:invoke" + ] + }, + "scope": "profile email", + "email_verified": false, + "clientHost": "127.0.0.1", + "preferred_username": "service-account-agentcube-service", + "clientAddress": "127.0.0.1", + "client_id": "agentcube-service" +} +``` + +### Get an admin token + +The `agentcube-admin` client has the `admin` role, which grants full access +including the ability to bypass resource ownership checks (RLAC): + +```bash +ADMIN_TOKEN=$(curl -s -X POST \ + -H "Host: keycloak.agentcube-system.svc.cluster.local:8080" \ + "http://localhost:8082/realms/agentcube/protocol/openid-connect/token" \ + -d "grant_type=client_credentials" \ + -d "client_id=agentcube-admin" \ + -d "client_secret=my-admin-secret" | jq -r '.access_token') + +echo $ADMIN_TOKEN +``` + +## Step 5 - Make an authenticated request + +Now use the token to make an authenticated request through the Router: + +```bash +curl -s -w "\nHTTP Status: %{http_code}\n" \ + -H "Authorization: Bearer $KEYCLOAK_TOKEN" \ + http://localhost:8081/v1/namespaces/default/agent-runtimes/sample-agent/invocations/ +``` + +Expected output - the request should now succeed (the exact response depends on +what agent runtimes you have deployed): + +``` +HTTP Status: 200 +``` + +## Step 6 - Verify RBAC enforcement + +Try using an **invalid** token to confirm the Router rejects it: + +```bash +curl -s -w "\nHTTP Status: %{http_code}\n" \ + -H "Authorization: Bearer invalid-token-here" \ + http://localhost:8081/v1/namespaces/default/agent-runtimes/sample-agent/invocations/ +``` + +Expected output : + +``` +{"code":"UNAUTHORIZED","error":"invalid or expired token"} +HTTP Status: 401 +``` + +## Step 7 - Use the Python SDK with authentication + +The AgentCube Python SDK supports Keycloak authentication via the +`ServiceAccountAuth` provider. This uses the `client_credentials` grant and +automatically refreshes tokens before they expire. + +### Install the SDK + +```bash +pip install -e sdk-python/ +``` + +### Example usage + +Create a Python script (or run in a Python REPL): + +```python +from agentcube import ServiceAccountAuth +from agentcube import AgentRuntimeClient + +# Initialize authentication with Keycloak +auth = ServiceAccountAuth( + token_url="http://localhost:8082/realms/agentcube/protocol/openid-connect/token", + client_id="agentcube-service", + client_secret="my-service-secret", + headers={"Host": "keycloak.agentcube-system.svc.cluster.local:8080"}, +) + +# Create the client - it will automatically attach Bearer tokens to requests +client = AgentRuntimeClient( + router_url="http://localhost:8081", + namespace="default", + agent_name="sample-agent", + auth=auth, +) + +# Invoke the agent +response = client.invoke(payload={}) +print(f"Response: {response}") +``` + +Expected output: + +If you are using the mock `sample-agent` from the Getting Started guide (which runs Python's built-in `http.server`), the script will raise a `501 Server Error: Not Implemented` exception. This is expected because `http.server` only handles `GET`/`HEAD` requests and doesn't implement `POST` handlers. A real agent framework (e.g., FastAPI or Flask) implementing `POST` handlers will return the actual response. + +The `ServiceAccountAuth` class handles the full token lifecycle: +- Fetches a new token on the first request +- Caches the token in memory +- Automatically refreshes 30 seconds before expiry + +## Understanding what changed + +The Helm chart passes four `--oidc-*` flags to the Router when +`router.oidc.issuerUrl` is set. The Router discovers Keycloak's signing keys +once at startup (via OIDC discovery), caches them, and validates every incoming +JWT locally — no per-request calls to Keycloak. + +> **Note:** The `--import-realm` flag only applies on Keycloak's **first +> startup**. Changes to the realm Secret after that will **not** take effect +> automatically. Use the Keycloak Admin Console or Admin API to update a live +> realm. + +### Realm roles + +| Role | Permissions | Assigned to | +|------|-------------|-------------| +| `sandbox:invoke` | Invoke agent runtimes and code interpreters | Default role (all users/clients) | +| `sandbox:manage` | Create/delete AgentRuntime and CodeInterpreter CRDs. Inherits `sandbox:invoke`. | — | +| `admin` | Full administrative access. Bypasses ownership checks. Inherits `sandbox:manage`. | `agentcube-admin` client | + +### OAuth2 clients + +| Client ID | Type | Grant | Purpose | +|-----------|------|-------|---------| +| `agentcube-service` | Confidential | `client_credentials` | Python SDK, external services (M2M) | +| `agentcube-sdk` | Public | Authorization code + PKCE | CLI, browser-based apps (interactive) | +| `agentcube-admin` | Confidential | `client_credentials` | Administrative operations | + +## Cleanup + +Stop all port-forwards by pressing `Ctrl+C` in each terminal where they are +running. + +If you want to **disable** external auth and go back to unauthenticated access, +remove the OIDC configuration from the Router: + +```bash +helm upgrade agentcube manifests/charts/base \ + -n agentcube-system \ + --reuse-values \ + --set router.oidc.issuerUrl="" \ + --set router.oidc.rolesClaim="" \ + --set router.oidc.requiredRole="" +``` + +Wait for the Router to restart: + +```bash +kubectl rollout status deployment/agentcube-router -n agentcube-system --timeout=120s +``` + +To also remove the Keycloak addon entirely: + +```bash +helm uninstall keycloak -n agentcube-system +``` + +This removes the Keycloak Deployment, Service, and associated Secrets from the +cluster. diff --git a/docs/design/keycloak-proposal.md b/docs/design/keycloak-proposal.md new file mode 100644 index 00000000..9edb612f --- /dev/null +++ b/docs/design/keycloak-proposal.md @@ -0,0 +1,330 @@ +# Keycloak Integration Design + +Author: Mahil Patel + +## Motivation + +AgentCube currently has no mechanism to authenticate external callers, anyone who can reach the Router endpoint can invoke agent runtimes and code interpreters without proving their identity. This proposal adds external authentication and authorization using Keycloak as the identity provider, covering OIDC token validation at the Router, role-based access control (RBAC), resource-level access control (RLAC), identity forwarding to downstream services, and Python SDK auth support. Keycloak is deployed as a separate addon chart, and the core chart enables auth when `router.oidc.issuerUrl` is set, so existing deployments are unaffected. + +## Architecture + +### Auth layers + +| Layer | Purpose | Mechanism | Status | +|-------|---------|-----------|--------| +| Internal (transport) | Machine identity between components | mTLS with SPIRE/file-based certificates | Existing | +| Internal (application) | Bind Router to PicoD sessions | Router-signed JWT with session claims | Existing | +| **External (authentication)** | **Prove human/SDK caller identity** | **Keycloak OIDC JWT validation** | **This proposal** | +| **External (authorization)** | **Enforce access rules** | **RBAC (realm roles) + RLAC (owner labels)** | **This proposal** | + +### End-to-end request flow + +```mermaid +sequenceDiagram + actor SDK as SDK / Client + participant KC as Keycloak + participant Router as Router + participant WM as WorkloadManager + participant PicoD as PicoD (Sandbox) + + Note over SDK, KC: 1. Token Acquisition + SDK->>KC: POST /realms/agentcube/protocol/openid-connect/token
(client_credentials: agentcube-service + client_secret) + KC-->>SDK: Access Token (JWT) + + Note over SDK, PicoD: 2. Authenticated Invocation + SDK->>Router: POST /v1/namespaces/.../invocations/...
Authorization: Bearer + + Note over Router: 3. Edge Authentication + Router->>Router: Validate JWT signature (cached JWKS) + Router->>Router: Check expiry, issuer, audience + Router->>Router: Extract roles from configured claim (e.g. realm_access.roles) + + Note over Router: 4. RBAC Check + Router->>Router: Require configured role + + Note over Router, WM: 5. Session Creation (if new) + Router->>Router: Sign identity JWT (sub, iss, aud, exp) + Router->>WM: POST /v1/agent-runtime
Authorization: Bearer
X-AgentCube-User-Identity: + WM->>WM: Verify identity JWT (Router public key)
Record ownership (annotation + hashed label) + WM-->>Router: Sandbox info (sessionId, endpoints, ownerId) + + Note over Router: 6. RLAC Check (if existing session) + Router->>Router: Verify sandbox owner matches JWT sub + + Note over Router, PicoD: 7. Proxy to Sandbox + Router->>Router: Sign internal JWT (session_id + user_sub) + Router->>PicoD: Forward request
Authorization: Bearer + PicoD->>PicoD: Verify internal JWT (existing flow) + PicoD-->>Router: Response + Router-->>SDK: Response +``` + +### Key decisions + +**JWKS-based offline validation.** The Router fetches Keycloak's public signing keys via OIDC discovery at startup. The `go-oidc` library caches and auto-rotates these keys - no per-request call to Keycloak. + +**Forwarding user identity via signed tokens, not plain headers.** The Router embeds the user's `sub` claim into existing channels: +- For PicoD: added as a `user_sub` claim in the Router-signed internal JWT (allows the agent runtime to trace actions back to a specific human user for logging, context, and downstream internal AuthZ). +- For WorkloadManager: the Router creates and signs a **new, short-lived internal JWT** (containing just the user's ID) and sends it in the `X-AgentCube-User-Identity` header. We do not pass the original Keycloak token, keeping WM decoupled from the external IDP. + +WorkloadManager and PicoD know they can trust this user identity because the internal JWT is cryptographically signed by the Router's private key. They verify the signature using the Router's public key (which is distributed via the `picod-router-identity` Kubernetes Secret). This guarantees the user identity was actually verified by the Router and prevents anyone from spoofing a fake identity header. + +## Detailed Design + +### 1. Keycloak Helm Deployment + +Keycloak is deployed as a **separate addon chart** (`manifests/charts/addons/keycloak/`), independent of the core AgentCube chart. This keeps the core chart provider-agnostic - it only needs an OIDC issuer URL, which works with Keycloak, Okta, Auth0, or any OIDC-compliant provider. The addon chart **bootstraps** the initial realm, clients, and roles — it is not a declarative management tool. After first startup, realm changes must be made via the Keycloak Admin API or Admin Console. + +The Deployment runs the official `quay.io/keycloak/keycloak` image and supports two modes: + +- **Dev mode** (`keycloak.devMode: true`, default): Runs `start-dev` with an embedded H2 database. Suitable for local development and testing. (Replicas must be 1). +- **Production mode** (`keycloak.devMode: false`): Runs `start` and requires an external database, an external Secret for credentials, and a public hostname. For production HA, you can increase `replicas` in the addon chart values (which requires an external database), though for complex enterprise setups we recommend the [Keycloak Operator](https://www.keycloak.org/operator/installation). + +The Deployment uses `--import-realm` to load the realm configuration from a mounted Secret (`keycloak-realm-config`) on first startup. The `--import-realm` flag automatically skips existing realms - subsequent restarts do not overwrite the realm, so Helm value changes to the realm JSON will not take effect on an existing database. The pod runs as non-root with all capabilities dropped. + +The Service exposes Keycloak on port 8080 (configurable) as a ClusterIP service. + +#### Realm Configuration + +The realm JSON defines a role hierarchy where each higher role inherits the ones below: + +``` +admin + └── sandbox:manage (create/delete AgentRuntime and CodeInterpreter CRDs) + └── sandbox:invoke (invoke agent runtimes and code interpreters) +``` + +`sandbox:invoke` is assigned to the default realm role, so every new user gets it automatically. + +**OAuth2 Clients:** + +| Client ID | Type | Purpose | +|-----------|------|---------| +| `agentcube-service` | Confidential (`client_credentials`) | External backend applications and automated scripts using the Python SDK (Machine-to-Machine / M2M) | +| `agentcube-sdk` | Public (`authorization_code` + PKCE) | Human end users interacting via the CLI or browser (Interactive) | +| `agentcube-router` | Confidential (`client_credentials`) | Internal Router service identity | +| `agentcube-admin` | Confidential (`client_credentials`) | Administrative operations | + +Confidential client secrets can be provided securely via an existing Kubernetes Secret (`keycloak.clients.existingSecret`) to prevent leaking them in Helm values. They are injected into the Keycloak pod as environment variables and securely substituted into the realm JSON during import. The `agentcube-sdk` client is a **public client** (no secret) that uses authorization code with PKCE for interactive flows, following RFC 8252 (OAuth 2.0 for Native Apps). The `agentcube-service` client is confidential and used for server-side `client_credentials` flows where a secret can be stored securely. Both clients have `sandbox:invoke` mapped via `scopeMappings`. + +Both clients include a **hardcoded audience protocol mapper** (`oidc-audience-mapper`) that injects `agentcube-api` into the access token's `aud` claim. The Router validates `aud = "agentcube-api"`. This follows OAuth2 convention — the audience identifies the resource server (the Router API), not the client that requested the token. + +For production mode, the chart includes validation guards that fail the render if required values like `existingSecret`, `database.vendor`, or `proxy.hostname` are missing. + +### 2. OIDC Token Validation (Router) + +The Router uses the `coreos/go-oidc` library to validate incoming JWTs. This is the standard OIDC library in the Go ecosystem - Kubernetes itself uses it. + +**New file: `pkg/router/oidc.go`** + +```go +type OIDCConfig struct { + IssuerURL string // e.g. "http://keycloak.agentcube-system.svc:8080/realms/agentcube" + Audience string // expected "aud" claim, e.g. "agentcube-api" + RolesClaim string // dot-separated path to roles array, e.g. "realm_access.roles" +} + +type Claims struct { + Subject string // from standard "sub" claim + Email string // from standard "email" claim + Roles []string // extracted from the configured RolesClaim path +} +``` + +Roles are extracted dynamically from the JWT using the configured `RolesClaim` path. For example, `realm_access.roles` means: parse the JWT payload as JSON, navigate into the `realm_access` object, then read the `roles` array. This makes the middleware work with any OIDC provider: + +| Provider | `--oidc-roles-claim` value | +|----------|---------------------------| +| Keycloak | `realm_access.roles` (default) | +| Auth0 | `https://myapp.com/roles` | +| Okta | `groups` | +| Azure AD | `roles` | + +The `OIDCValidator` uses `go-oidc` for JWKS discovery and key caching (`oidc.NewProvider()`), but validates the token as an **OAuth2 access token**, not an ID token. Keycloak's `client_credentials` grant returns an access token, and while Keycloak issues these as signed JWTs using the same keys, the audience semantics differ from ID tokens. `ValidateToken` verifies the JWT signature against cached JWKS keys, then explicitly checks `iss`, `exp`, `nbf`, `aud`, and extracts roles from the configured claim path — all locally, no per-request call to Keycloak. + +### 3. Authentication Middleware (Router) + +**New file: `pkg/router/auth.go`** + +Two gin middleware functions : + +- **`oidcAuthMiddleware()`** - extracts the Bearer token, validates it via the OIDC validator, stores Claims in context. No-op when auth is disabled. +- **`requireRole(role)`** - checks the extracted roles list (from the configured claim path) for the required role. Returns 403 if missing. + +Applied to the `/v1` route group: + +```go +v1 := s.engine.Group("/v1") +v1.Use(s.oidcAuthMiddleware()) // validate JWT +if s.oidcValidator != nil { + // Require configured role + v1.Use(requireRole(s.config.OIDCRequiredRole)) +} +v1.Use(s.concurrencyLimitMiddleware()) // existing +``` + +Health endpoints skip authentication - they must remain accessible for Kubernetes probes. + +### 4. Identity Forwarding + +#### Router → PicoD + +The Router already signs an internal JWT for each proxied request. When external auth is enabled, the caller's `sub` claim is embedded in this JWT: + +```go +claims := map[string]interface{}{ + "session_id": sandbox.SessionID, +} +if oidcClaims := extractClaims(c); oidcClaims != nil { + claims["user_sub"] = oidcClaims.Subject +} +``` + +PicoD doesn't need any changes - the extra claim is simply available if it ever needs to read it. + +#### Router → WorkloadManager + +The `createSandbox` method in `session_manager.go` signs a short-lived identity JWT using the Router's existing private key (the same `picod-router-identity` key from the PicoD auth design) and sends it as a header: + +```go +identityClaims := map[string]interface{}{ + "sub": claims.Subject, + "iss": "agentcube-router", + "aud": "workloadmanager", + "exp": time.Now().Add(30 * time.Second).Unix(), +} +identityToken, _ := s.jwtManager.GenerateToken(identityClaims) +req.Header.Set("X-AgentCube-User-Identity", identityToken) +``` + +WM verifies this JWT using the Router's public key from the `picod-router-identity` Secret. This provides cryptographic proof of the user identity without depending on mTLS — the identity is trustworthy regardless of transport security configuration. WM continues to authenticate the Router itself via the existing K8s SA token. + +### 5. RLAC - Resource-Level Access Control + +RLAC ensures users can only interact with sandboxes they created. + +**Ownership tagging (WorkloadManager):** When WM creates a sandbox, it verifies the `X-AgentCube-User-Identity` JWT, extracts the `sub` claim, and records ownership in two ways: + +```go +Annotations: map[string]string{ + "agentcube.io/owner": userID, // raw sub from verified identity JWT +} +Labels: map[string]string{ + "agentcube.io/owner-hash": sha256Short(userID), // first 63 chars of hex SHA-256 +} +``` + +The raw `sub` is stored in an annotation (no length/charset restrictions) and in Redis (`SandboxInfo.OwnerID`) for authoritative ownership checks. The hashed label is used only for Kubernetes label-based selection if needed. Keycloak UUIDs (36 chars) would fit as labels directly, but federated or pairwise subject identifiers can exceed the 63-character Kubernetes label limit, so we hash defensively. + +The owner ID is returned in the create sandbox response so the Router can persist it in its Redis cache. + +**Ownership verification (Router):** Before proxying to an existing sandbox, the Router checks if the caller owns it. When auth is enabled, the check is **fail-closed** — if the owner is missing (legacy sandbox, cache issue, WM bug), access is denied rather than silently allowed: + +```go +if s.oidcValidator != nil { + claims := extractClaims(c) + if claims != nil { + // Admin role bypasses RLAC ownership checks + if slices.Contains(claims.Roles, "admin") { + return true + } + + if sandbox.OwnerID == "" { + c.JSON(http.StatusForbidden, gin.H{"error": "sandbox has no owner record"}) + return + } + if sandbox.OwnerID != claims.Subject { + c.JSON(http.StatusForbidden, gin.H{"error": "you do not own this sandbox"}) + return + } + } +} +``` + +This check is skipped when external auth is disabled. Sandboxes created before auth was enabled will be inaccessible once auth is turned on, which is the intended behavior. + +### 6. Python SDK Auth + +The Python SDK currently supports an explicit `auth_token` string or reading a K8s service account token from a file. Neither supports `client_credentials` flow or token refresh. + +We add a pluggable auth provider pattern: + +**New file: `sdk-python/agentcube/auth.py`** + +```python +@runtime_checkable +class AuthProvider(Protocol): + def get_token(self) -> str: ... + +class ServiceAccountAuth: + """Authenticates using OAuth2 client_credentials grant against Keycloak.""" + def __init__(self, token_url: str, client_id: str, client_secret: str): ... + def get_token(self) -> str: + # Returns cached token, refreshes 30s before expiry + ... + +class TokenAuth: + """Wraps a pre-obtained token. No refresh support.""" + def __init__(self, token: str): ... + def get_token(self) -> str: ... +``` + +The existing clients (`ControlPlaneClient`, Data Plane clients, high-level clients) are updated to accept an `auth` parameter. The `auth_token` string parameter is kept for backward compatibility. Each request calls `self._auth.get_token()` to get a fresh token. `ServiceAccountAuth` is used with the `agentcube-service` client (confidential, `client_credentials`). For interactive CLI flows, a separate `DeviceCodeAuth` or browser-based flow using the public `agentcube-sdk` client can be added later. + +### 7. Helm Wiring and CLI Flags + +The core AgentCube chart has provider-agnostic OIDC configuration under `router.oidc`. When `router.oidc.issuerUrl` is set, the Router Deployment template passes additional args: + +```yaml +{{- if .Values.router.oidc.issuerUrl }} +- {{ printf "--oidc-issuer-url=%s" .Values.router.oidc.issuerUrl | quote }} +- {{ printf "--oidc-audience=%s" .Values.router.oidc.audience | quote }} +- {{ printf "--oidc-roles-claim=%s" .Values.router.oidc.rolesClaim | quote }} +- {{ printf "--oidc-required-role=%s" .Values.router.oidc.requiredRole | quote }} +{{- end }} +``` + +When using the Keycloak addon chart, the user sets the issuer URL to point at the in-cluster Keycloak service: + +```bash +# 1. Deploy the Keycloak addon +helm install keycloak manifests/charts/addons/keycloak -n agentcube-system \ + --set adminUser=admin --set adminPassword=admin \ + --set clients.service.secret=my-svc-secret \ + --set clients.router.secret=my-router-secret \ + --set clients.admin.secret=my-admin-secret + +# 2. Deploy AgentCube with OIDC pointed at the addon +helm install agentcube manifests/charts/base -n agentcube-system \ + --set router.oidc.issuerUrl=http://keycloak.agentcube-system.svc:8080/realms/agentcube \ + --set router.oidc.rolesClaim=realm_access.roles \ + --set router.oidc.requiredRole=sandbox:invoke +``` + +Four new flags in `cmd/router/main.go`: + +| Flag | Default | Description | +|------|---------|-------------| +| `--oidc-issuer-url` | `""` | OIDC provider issuer URL | +| `--oidc-audience` | `"agentcube-api"` | Expected JWT audience claim | +| `--oidc-roles-claim` | `""` | REQUIRED if issuer is set. Dot-separated path to the roles array in the JWT (e.g. `realm_access.roles` for Keycloak, `groups` for Okta). | +| `--oidc-required-role` | `""` | REQUIRED if issuer is set. The role required to access the API endpoints (e.g. `sandbox:invoke` - Note: this is just an example, though our Keycloak addon creates this role automatically). | + +External authentication is automatically enabled when `--oidc-issuer-url` is provided. The Router will validate tokens against this issuer. + +## Testing + +**Unit Tests:** +- `pkg/router/oidc_test.go` - OIDC validator tests using `httptest` as a fake JWKS server. +- `pkg/router/auth_test.go` - middleware tests: missing header, invalid token, valid token, role checks. +- WorkloadManager owner label tests - verify labels are applied during sandbox creation. +- Python SDK auth tests - `ServiceAccountAuth` token refresh, `TokenAuth` static token, backward compatibility. + +**E2E Tests:** + +The E2E suite (`test/e2e/`) will be extended to deploy Keycloak in the Kind cluster: + +1. Preload the Keycloak image into Kind +2. Deploy with the Keycloak addon and set `router.oidc.issuerUrl` in the base chart +3. Obtain a real token via `client_credentials` grant +4. Test cases: no token → 401, invalid token → 401, valid token → success, RLAC ownership → 403 diff --git a/go.mod b/go.mod index f0a23c80..06c63e00 100644 --- a/go.mod +++ b/go.mod @@ -7,8 +7,9 @@ toolchain go1.24.9 require ( github.com/agiledragon/gomonkey/v2 v2.13.0 github.com/alicebob/miniredis/v2 v2.35.0 - github.com/gin-contrib/gzip v1.0.1 + github.com/coreos/go-oidc/v3 v3.14.1 github.com/fsnotify/fsnotify v1.9.0 + github.com/gin-contrib/gzip v1.0.1 github.com/gin-gonic/gin v1.10.0 github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 @@ -40,6 +41,7 @@ require ( github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/gabriel-vasile/mimetype v1.4.3 // indirect github.com/gin-contrib/sse v0.1.0 // indirect + github.com/go-jose/go-jose/v4 v4.1.3 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect diff --git a/go.sum b/go.sum index 5c75dfcd..e0f89f39 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,8 @@ github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/ github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg= github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= +github.com/coreos/go-oidc/v3 v3.14.1 h1:9ePWwfdwC4QKRlCXsJGou56adA/owXczOzwKdOumLqk= +github.com/coreos/go-oidc/v3 v3.14.1/go.mod h1:HaZ3szPaZ0e4r6ebqvsLWlk2Tn+aejfmrfah6hnSYEU= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -43,6 +45,8 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y= +github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= +github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -242,6 +246,8 @@ golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/manifests/charts/addons/keycloak/Chart.yaml b/manifests/charts/addons/keycloak/Chart.yaml new file mode 100644 index 00000000..7d5f1578 --- /dev/null +++ b/manifests/charts/addons/keycloak/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: agentcube-keycloak +description: Keycloak identity provider addon for AgentCube +version: 0.1.0 +appVersion: "26.0.8" diff --git a/manifests/charts/addons/keycloak/templates/deployment.yaml b/manifests/charts/addons/keycloak/templates/deployment.yaml new file mode 100644 index 00000000..a9738cee --- /dev/null +++ b/manifests/charts/addons/keycloak/templates/deployment.yaml @@ -0,0 +1,209 @@ +{{- if empty .Values.adminUser }} + {{- fail "adminUser is required" }} +{{- end }} +{{- if and .Values.devMode (gt (int (default 1 .Values.replicas)) 1) }} + {{- fail "replicas cannot be greater than 1 when devMode is true. Keycloak dev mode uses an embedded H2 database which does not support clustering." }} +{{- end }} +{{- if .Values.database.vendor }} +{{- if empty .Values.database.url }} + {{- fail "database.url is required when database.vendor is set" }} +{{- end }} +{{- if empty .Values.database.username }} + {{- fail "database.username is required when database.vendor is set" }} +{{- end }} +{{- end }} +{{- if not .Values.devMode }} +{{- if not (or (and .Values.proxy.httpEnabled .Values.proxy.headers) .Values.tls.existingSecret) }} + {{- fail "Production mode (devMode=false) requires either: (1) TLS-termination at proxy (set proxy.httpEnabled=true AND proxy.headers) OR (2) TLS-termination at Keycloak (set tls.existingSecret)" }} +{{- end }} +{{- if empty .Values.existingSecret }} + {{- fail "existingSecret is required when devMode is false (production mode requires an external Secret for the admin password)" }} +{{- end }} +{{- if empty .Values.database.vendor }} + {{- fail "database.vendor is required when devMode is false (production mode requires an external database)" }} +{{- end }} +{{- if empty .Values.database.existingSecret }} + {{- fail "database.existingSecret is required when devMode is false (production mode requires an external Secret for the database password)" }} +{{- end }} +{{- if empty .Values.proxy.hostname }} + {{- fail "proxy.hostname is required when devMode is false (production mode requires a strict hostname)" }} +{{- end }} +{{- range .Values.sdk.redirectUris }} + {{- if contains "*" . }} + {{- fail "Wildcard redirect URIs (e.g. '*') are not allowed in production mode (devMode=false) due to open redirect risks. Please provide explicit, fully-qualified URIs for sdk.redirectUris." }} + {{- end }} +{{- end }} +{{- range .Values.sdk.webOrigins }} + {{- if contains "*" . }} + {{- fail "Wildcard web origins (e.g. '*') are not allowed in production mode (devMode=false). Please provide explicit, fully-qualified URIs for sdk.webOrigins." }} + {{- end }} +{{- end }} +{{- end }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: keycloak + namespace: {{ .Release.Namespace }} + labels: + app: keycloak +spec: + replicas: {{ .Values.replicas | default 1 }} + selector: + matchLabels: + app: keycloak + template: + metadata: + labels: + app: keycloak + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + runAsNonRoot: true + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: keycloak + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + # Note: --import-realm only applies on an empty database/first start. + # Realm Secret updates will not be re-applied on subsequent pod restarts. + {{- if .Values.devMode }} + args: ["start-dev", "--import-realm"] + {{- else }} + args: ["start", "--import-realm"] + {{- end }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + - name: management + containerPort: 9000 + protocol: TCP + env: + - name: KC_BOOTSTRAP_ADMIN_USERNAME + value: {{ .Values.adminUser | quote }} + - name: KC_BOOTSTRAP_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Values.existingSecret | default "keycloak-credentials" }} + key: {{ if .Values.existingSecret }}{{ .Values.existingSecretKey | default "admin-password" }}{{ else }}admin-password{{ end }} + - name: KC_HEALTH_ENABLED + value: "true" + - name: KC_HTTP_PORT + value: {{ .Values.service.port | quote }} + {{- if .Values.database.vendor }} + - name: KC_DB + value: {{ .Values.database.vendor | quote }} + - name: KC_DB_URL + value: {{ .Values.database.url | quote }} + - name: KC_DB_USERNAME + value: {{ .Values.database.username | quote }} + - name: KC_DB_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Values.database.existingSecret | default "keycloak-credentials" }} + key: {{ if .Values.database.existingSecret }}{{ .Values.database.existingSecretKey | default "db-password" }}{{ else }}db-password{{ end }} + {{- end }} + {{- if .Values.proxy.headers }} + - name: KC_PROXY_HEADERS + value: {{ .Values.proxy.headers | quote }} + {{- end }} + {{- if .Values.proxy.httpEnabled }} + - name: KC_HTTP_ENABLED + value: "true" + {{- end }} + {{- if .Values.proxy.hostname }} + - name: KC_HOSTNAME + value: {{ .Values.proxy.hostname | quote }} + {{- end }} + {{- if .Values.tls.existingSecret }} + - name: KC_HTTPS_CERTIFICATE_FILE + value: "/opt/keycloak/conf/certs/tls.crt" + - name: KC_HTTPS_CERTIFICATE_KEY_FILE + value: "/opt/keycloak/conf/certs/tls.key" + {{- end }} + - name: KC_CLIENT_SERVICE_SECRET + valueFrom: + secretKeyRef: + name: {{ .Values.clients.existingSecret | default "keycloak-credentials" }} + key: {{ if .Values.clients.existingSecret }}{{ .Values.clients.existingSecretServiceKey | default "client-service-secret" }}{{ else }}client-service-secret{{ end }} + - name: KC_CLIENT_ROUTER_SECRET + valueFrom: + secretKeyRef: + name: {{ .Values.clients.existingSecret | default "keycloak-credentials" }} + key: {{ if .Values.clients.existingSecret }}{{ .Values.clients.existingSecretRouterKey | default "client-router-secret" }}{{ else }}client-router-secret{{ end }} + - name: KC_CLIENT_ADMIN_SECRET + valueFrom: + secretKeyRef: + name: {{ .Values.clients.existingSecret | default "keycloak-credentials" }} + key: {{ if .Values.clients.existingSecret }}{{ .Values.clients.existingSecretAdminKey | default "client-admin-secret" }}{{ else }}client-admin-secret{{ end }} + volumeMounts: + - name: realm-config + mountPath: /opt/keycloak/data/import + readOnly: true + {{- if .Values.tls.existingSecret }} + - name: certificates + mountPath: /opt/keycloak/conf/certs + readOnly: true + {{- end }} + # Keycloak's JVM + realm import takes 60-90s. The startup probe gives a 300s + # window before liveness checks begin, preventing premature pod restarts. + startupProbe: + httpGet: + path: /health/started + port: management + failureThreshold: 30 + periodSeconds: 10 + livenessProbe: + httpGet: + path: /health/live + port: management + periodSeconds: 30 + readinessProbe: + httpGet: + path: /health/ready + port: management + periodSeconds: 10 + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + - name: realm-config + secret: + secretName: keycloak-realm-config + {{- if .Values.tls.existingSecret }} + - name: certificates + secret: + secretName: {{ .Values.tls.existingSecret }} + items: + - key: {{ .Values.tls.secretCertificateKey | default "tls.crt" }} + path: tls.crt + - key: {{ .Values.tls.secretPrivateKeyKey | default "tls.key" }} + path: tls.key + {{- end }} + +--- +apiVersion: v1 +kind: Service +metadata: + name: keycloak + namespace: {{ .Release.Namespace }} + labels: + app: keycloak +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + app: keycloak diff --git a/manifests/charts/addons/keycloak/templates/realm-config.yaml b/manifests/charts/addons/keycloak/templates/realm-config.yaml new file mode 100644 index 00000000..4c54ac90 --- /dev/null +++ b/manifests/charts/addons/keycloak/templates/realm-config.yaml @@ -0,0 +1,197 @@ +apiVersion: v1 +kind: Secret +metadata: + name: keycloak-realm-config + namespace: {{ .Release.Namespace }} + labels: + app: keycloak +type: Opaque +# stringData accepts plain text and Kubernetes base64-encodes it automatically. +stringData: + # Note: The --import-realm flag only applies this JSON on an empty/first start. + # Once the realm exists, subsequent pod restarts will silently IGNORE updates to this Secret. + # To update a live production realm, use the Keycloak Admin API or a declarative tool. + # Filename must follow -realm.json convention for --import-realm to detect it. + {{ printf "%s-realm.json" .Values.realm | quote }}: | + { + "realm": {{ .Values.realm | quote }}, + "enabled": true, + "sslRequired": {{ if .Values.devMode }}"none"{{ else }}"external"{{ end }}, + + "defaultRole": { + "name": "default-roles-{{ .Values.realm }}", + "description": "${role_default-roles}", + "composite": true, + "clientRole": false + }, + + "roles": { + "realm": [ + { + "name": "default-roles-{{ .Values.realm }}", + "description": "${role_default-roles}", + "composite": true, + "clientRole": false, + "composites": { + "realm": [ + "sandbox:invoke" + ] + } + }, + { + "name": "sandbox:invoke", + "description": "Permission to invoke agent runtimes and code interpreters." + }, + { + "name": "sandbox:manage", + "description": "Permission to create/delete AgentRuntime and CodeInterpreter CRDs. Inherits sandbox:invoke.", + "composite": true, + "composites": { + "realm": ["sandbox:invoke"] + } + }, + { + "name": "admin", + "description": "Full administrative access. Inherits sandbox:manage.", + "composite": true, + "composites": { + "realm": ["sandbox:manage"] + } + } + ] + }, + + "clients": [ + { + "clientId": "agentcube-service", + "name": "AgentCube Service", + "description": "Confidential client for server-side authentication using client_credentials grant.", + "enabled": true, + "protocol": "openid-connect", + "publicClient": false, + "secret": "${KC_CLIENT_SERVICE_SECRET}", + "standardFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "redirectUris": [], + "webOrigins": [], + "protocolMappers": [ + { + "name": "agentcube-api-audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.custom.audience": "agentcube-api", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ] + }, + { + "clientId": "agentcube-sdk", + "name": "AgentCube SDK", + "description": "Public client for interactive SDK and CLI authentication using authorization code with PKCE.", + "enabled": true, + "protocol": "openid-connect", + "publicClient": true, + "standardFlowEnabled": true, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": false, + "redirectUris": {{ .Values.sdk.redirectUris | toJson }}, + "webOrigins": {{ .Values.sdk.webOrigins | toJson }}, + "attributes": { + "pkce.code.challenge.method": "S256" + }, + "protocolMappers": [ + { + "name": "agentcube-api-audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.custom.audience": "agentcube-api", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ] + }, + { + "clientId": "agentcube-router", + "name": "AgentCube Router", + "description": "Internal service client for the Router component.", + "enabled": true, + "protocol": "openid-connect", + "publicClient": false, + "secret": "${KC_CLIENT_ROUTER_SECRET}", + "standardFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "redirectUris": [], + "webOrigins": [] + }, + { + "clientId": "agentcube-admin", + "name": "AgentCube Admin", + "description": "Confidential client for administrative operations.", + "enabled": true, + "protocol": "openid-connect", + "publicClient": false, + "secret": "${KC_CLIENT_ADMIN_SECRET}", + "standardFlowEnabled": false, + "directAccessGrantsEnabled": false, + "serviceAccountsEnabled": true, + "redirectUris": [], + "webOrigins": [], + "protocolMappers": [ + { + "name": "agentcube-api-audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.custom.audience": "agentcube-api", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ] + } + ], + + "scopeMappings": [ + { + "client": "agentcube-service", + "roles": ["sandbox:invoke"] + }, + { + "client": "agentcube-sdk", + "roles": ["sandbox:invoke"] + }, + { + "client": "agentcube-admin", + "roles": ["sandbox:invoke", "admin"] + } + ], + + "users": [ + { + "username": "service-account-agentcube-admin", + "enabled": true, + "serviceAccountClientId": "agentcube-admin", + "realmRoles": [ + "admin" + ] + }, + { + "username": "service-account-agentcube-service", + "enabled": true, + "serviceAccountClientId": "agentcube-service", + "realmRoles": [ + "sandbox:invoke" + ] + } + ] + } diff --git a/manifests/charts/addons/keycloak/templates/secrets.yaml b/manifests/charts/addons/keycloak/templates/secrets.yaml new file mode 100644 index 00000000..e68fd5b5 --- /dev/null +++ b/manifests/charts/addons/keycloak/templates/secrets.yaml @@ -0,0 +1,27 @@ +{{- $renderAdmin := not .Values.existingSecret }} +{{- $renderDb := and .Values.database.vendor (not .Values.database.existingSecret) }} +{{- $renderClients := not .Values.clients.existingSecret }} +{{- if or $renderAdmin $renderDb $renderClients }} +# Chart-managed Secret for Keycloak admin, database, and/or client credentials. +# Rendered if any of these credentials need to be managed by the chart. +apiVersion: v1 +kind: Secret +metadata: + name: keycloak-credentials + namespace: {{ .Release.Namespace }} + labels: + app: keycloak +type: Opaque +stringData: + {{- if $renderAdmin }} + admin-password: {{ required "adminPassword is required when existingSecret is not set" .Values.adminPassword | quote }} + {{- end }} + {{- if $renderDb }} + db-password: {{ required "database.password is required when database.vendor is set and database.existingSecret is not set" .Values.database.password | quote }} + {{- end }} + {{- if $renderClients }} + client-service-secret: {{ required "clients.service.secret is required when clients.existingSecret is not set" .Values.clients.service.secret | quote }} + client-router-secret: {{ required "clients.router.secret is required when clients.existingSecret is not set" .Values.clients.router.secret | quote }} + client-admin-secret: {{ required "clients.admin.secret is required when clients.existingSecret is not set" .Values.clients.admin.secret | quote }} + {{- end }} +{{- end }} diff --git a/manifests/charts/addons/keycloak/values.yaml b/manifests/charts/addons/keycloak/values.yaml new file mode 100644 index 00000000..6853b760 --- /dev/null +++ b/manifests/charts/addons/keycloak/values.yaml @@ -0,0 +1,69 @@ +# Default values for the Keycloak addon chart. + +# Global settings +replicas: 1 +imagePullSecrets: [] + +# Use start-dev mode (embedded H2 DB, HTTP) for local development. +# Set to false for production — requires database.vendor and proxy settings below. +devMode: true +image: + repository: quay.io/keycloak/keycloak + tag: "26.0.8" + pullPolicy: IfNotPresent +adminUser: "" # Required. +adminPassword: "" # Dev only. Production must use existingSecret. +existingSecret: "" # Name of a pre-existing Secret containing admin credentials +existingSecretKey: "admin-password" # Key within the Secret holding the password +# External database configuration (required when devMode is false) +database: + vendor: "" # Database type: "postgres", "mysql", "mariadb". Empty = embedded H2 (dev only). + url: "" # JDBC URL, e.g., "jdbc:postgresql://postgres:5432/keycloak" + username: "" + password: "" + existingSecret: "" # Name of a pre-existing Secret containing the database password + existingSecretKey: "db-password" +# Reverse proxy configuration (required when Keycloak is behind an Ingress or load balancer) +proxy: + headers: "" # Proxy header mode: "xforwarded" or "forwarded". Empty = disabled. + httpEnabled: false # Allow HTTP traffic when running behind a TLS-terminating proxy + hostname: "" # Public hostname, e.g., "keycloak.example.com". Empty = auto-detect. +service: + type: ClusterIP + port: 8080 +realm: "agentcube" +# Client secrets for confidential OAuth2 clients. +# The agentcube-sdk client is public (PKCE) and does not require a secret. +clients: + existingSecret: "" # Name of a pre-existing Secret containing client secrets + # The keys below are only used if existingSecret is SET. + # They define which keys inside your external Secret hold the actual passwords. + existingSecretServiceKey: "client-service-secret" + existingSecretRouterKey: "client-router-secret" + existingSecretAdminKey: "client-admin-secret" + service: + secret: "" + router: + secret: "" + admin: + secret: "" +# TLS configuration for Keycloak terminating TLS directly (devMode=false only) +tls: + existingSecret: "" # Name of pre-existing Secret containing tls.crt and tls.key + secretCertificateKey: "tls.crt" # Key inside the secret holding the certificate + secretPrivateKeyKey: "tls.key" # Key inside the secret holding the private key +sdk: + # WARNING: Wildcard URIs (e.g. 'http://localhost*') pose security risks (open redirect vulnerabilities) in production. + # When devMode is false, the deployment will fail if these contain wildcards. + # You must override these with explicit, fully-qualified URIs for production. + redirectUris: + - "http://localhost*" + webOrigins: + - "http://localhost*" +resources: + limits: + cpu: 500m + memory: 1Gi + requests: + cpu: 100m + memory: 512Mi diff --git a/manifests/charts/base/templates/agentcube-router.yaml b/manifests/charts/base/templates/agentcube-router.yaml index 20e796ea..1e86c595 100644 --- a/manifests/charts/base/templates/agentcube-router.yaml +++ b/manifests/charts/base/templates/agentcube-router.yaml @@ -1,3 +1,11 @@ +{{- if .Values.router.oidc.issuerUrl }} +{{- if empty .Values.router.oidc.rolesClaim }} + {{- fail "router.oidc.rolesClaim is required when router.oidc.issuerUrl is set" }} +{{- end }} +{{- if empty .Values.router.oidc.requiredRole }} + {{- fail "router.oidc.requiredRole is required when router.oidc.issuerUrl is set" }} +{{- end }} +{{- end }} apiVersion: apps/v1 kind: Deployment metadata: @@ -83,6 +91,12 @@ spec: - --mtls-key={{ .Values.spire.spiffeHelper.certDir }}/{{ .Values.spire.spiffeHelper.keyFileName }} - --mtls-ca={{ .Values.spire.spiffeHelper.certDir }}/{{ .Values.spire.spiffeHelper.bundleFileName }} {{- end }} + {{- if .Values.router.oidc.issuerUrl }} + - {{ printf "--oidc-issuer-url=%s" .Values.router.oidc.issuerUrl | quote }} + - {{ printf "--oidc-audience=%s" .Values.router.oidc.audience | quote }} + - {{ printf "--oidc-roles-claim=%s" .Values.router.oidc.rolesClaim | quote }} + - {{ printf "--oidc-required-role=%s" .Values.router.oidc.requiredRole | quote }} + {{- end }} resources: {{- toYaml .Values.router.resources | nindent 12 }} livenessProbe: diff --git a/manifests/charts/base/values.yaml b/manifests/charts/base/values.yaml index 14e02247..fbe54fcf 100644 --- a/manifests/charts/base/values.yaml +++ b/manifests/charts/base/values.yaml @@ -32,6 +32,14 @@ router: config: {} extraEnv: [] serviceAccountName: "agentcube-router" + # OIDC configuration for external user authentication. + # When oidc.issuerUrl is set, the Router validates Bearer tokens against this OIDC provider. + # Works with any OIDC-compliant provider (Keycloak, Okta, Auth0, etc.). + oidc: + issuerUrl: "" # e.g., "http://keycloak.agentcube-system.svc:8080/realms/agentcube" + audience: "agentcube-api" # Expected "aud" claim in access tokens + rolesClaim: "" # Required if issuerUrl is set. JSON path to the roles array in the JWT (e.g., "realm_access.roles" for Keycloak, "groups" for Okta). + requiredRole: "" # Required if issuerUrl is set. The role required to access the API endpoints (e.g., "sandbox:invoke"). # AgentCube Workload Manager workloadmanager: diff --git a/pkg/common/types/sandbox.go b/pkg/common/types/sandbox.go index b829f780..b79d3b8a 100644 --- a/pkg/common/types/sandbox.go +++ b/pkg/common/types/sandbox.go @@ -30,6 +30,7 @@ type SandboxInfo struct { Name string `json:"name"` EntryPoints []SandboxEntryPoint `json:"entryPoints"` SessionID string `json:"sessionId"` + OwnerID string `json:"ownerID,omitempty"` CreatedAt time.Time `json:"createdAt"` ExpiresAt time.Time `json:"expiresAt"` // IdleTimeout is the per-sandbox idle timeout configured via SessionTimeout on the @@ -62,6 +63,7 @@ type CreateSandboxResponse struct { SandboxID string `json:"sandboxId"` SandboxName string `json:"sandboxName"` EntryPoints []SandboxEntryPoint `json:"entryPoints"` + OwnerID string `json:"ownerID,omitempty"` } func (car *CreateSandboxRequest) Validate() error { diff --git a/pkg/router/auth.go b/pkg/router/auth.go new file mode 100644 index 00000000..ac44cb45 --- /dev/null +++ b/pkg/router/auth.go @@ -0,0 +1,135 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "context" + "net/http" + "slices" + "strings" + + "github.com/gin-gonic/gin" + "k8s.io/klog/v2" +) + +// contextKeyType is a private type for context keys to avoid collisions. +type contextKeyType string + +// contextKeyOIDCClaims is the context key for storing validated OIDC claims. +const contextKeyOIDCClaims = contextKeyType("oidcClaims") + +// oidcAuthMiddleware validates incoming OIDC JWTs. +func (s *Server) oidcAuthMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + if s.oidcValidator == nil { + c.Next() + return + } + + // Extract the Bearer token from the Authorization header. + authHeader := c.GetHeader("Authorization") + if authHeader == "" { + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "missing authorization header", + "code": "UNAUTHORIZED", + }) + c.Abort() + return + } + + parts := strings.SplitN(authHeader, " ", 2) + if len(parts) != 2 || !strings.EqualFold(parts[0], "Bearer") { + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "invalid authorization header format", + "code": "UNAUTHORIZED", + }) + c.Abort() + return + } + + rawToken := strings.TrimSpace(parts[1]) + if rawToken == "" { + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "empty bearer token", + "code": "UNAUTHORIZED", + }) + c.Abort() + return + } + + // Validate the token against the OIDC provider. + claims, err := s.oidcValidator.ValidateToken(c.Request.Context(), rawToken) + if err != nil { + klog.V(2).Infof("OIDC token validation failed: %v", err) + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "invalid or expired token", + "code": "UNAUTHORIZED", + }) + c.Abort() + return + } + + c.Set(string(contextKeyOIDCClaims), claims) + + // Store in request context for downstream code + ctx := context.WithValue(c.Request.Context(), contextKeyOIDCClaims, claims) + c.Request = c.Request.WithContext(ctx) + + c.Next() + } +} + +// requireRole enforces a specific OIDC role for the endpoint. +func requireRole(role string) gin.HandlerFunc { + return func(c *gin.Context) { + claims := extractClaims(c) + if claims == nil { + c.JSON(http.StatusUnauthorized, gin.H{ + "error": "authentication required", + "code": "UNAUTHORIZED", + }) + c.Abort() + return + } + + if !slices.Contains(claims.Roles, role) { + klog.V(2).Infof("RBAC check failed: user %s missing required role %q (has: %v)", + claims.Subject, role, claims.Roles) + c.JSON(http.StatusForbidden, gin.H{ + "error": "insufficient permissions: missing role " + role, + "code": "FORBIDDEN", + }) + c.Abort() + return + } + + c.Next() + } +} + +// extractClaims extracts validated OIDC claims from the Gin context. +func extractClaims(c *gin.Context) *Claims { + val, exists := c.Get(string(contextKeyOIDCClaims)) + if !exists { + return nil + } + claims, ok := val.(*Claims) + if !ok { + return nil + } + return claims +} diff --git a/pkg/router/auth_test.go b/pkg/router/auth_test.go new file mode 100644 index 00000000..3c16a18d --- /dev/null +++ b/pkg/router/auth_test.go @@ -0,0 +1,108 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/assert" +) + +func init() { + gin.SetMode(gin.TestMode) +} + +func TestOIDCAuthMiddleware_Disabled(t *testing.T) { + // When oidcValidator is nil, middleware should be a passthrough. + server := &Server{oidcValidator: nil} + + w := httptest.NewRecorder() + c, router := gin.CreateTestContext(w) + router.Use(server.oidcAuthMiddleware()) + router.GET("/test", func(c *gin.Context) { + claims := extractClaims(c) + assert.Nil(t, claims, "no claims should be set when auth is disabled") + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + c.Request = httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, c.Request) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestOIDCAuthMiddleware_MissingHeader(t *testing.T) { + server := &Server{oidcValidator: &OIDCValidator{}} + + w := httptest.NewRecorder() + _, router := gin.CreateTestContext(w) + router.Use(server.oidcAuthMiddleware()) + router.GET("/test", func(_ *gin.Context) { + t.Error("handler should not be called") + }) + + req := httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusUnauthorized, w.Code) + assert.Contains(t, w.Body.String(), "missing authorization header") +} + +func TestRequireRole_HasRole(t *testing.T) { + w := httptest.NewRecorder() + _, router := gin.CreateTestContext(w) + + // Pre-set claims in context before the role check + router.Use(func(c *gin.Context) { + c.Set(string(contextKeyOIDCClaims), &Claims{ + Subject: "user-1", + Roles: []string{"sandbox:invoke", "sandbox:manage"}, + }) + c.Next() + }) + router.Use(requireRole("sandbox:invoke")) + router.GET("/test", func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + req := httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusOK, w.Code) +} + +func TestRequireRole_MissingRole(t *testing.T) { + w := httptest.NewRecorder() + _, router := gin.CreateTestContext(w) + + router.Use(func(c *gin.Context) { + c.Set(string(contextKeyOIDCClaims), &Claims{ + Subject: "user-1", + Roles: []string{"sandbox:invoke"}, + }) + c.Next() + }) + router.Use(requireRole("admin")) + router.GET("/test", func(_ *gin.Context) { + t.Error("handler should not be called") + }) + + req := httptest.NewRequest(http.MethodGet, "/test", nil) + router.ServeHTTP(w, req) + assert.Equal(t, http.StatusForbidden, w.Code) + assert.Contains(t, w.Body.String(), "insufficient permissions: missing role admin") +} diff --git a/pkg/router/config.go b/pkg/router/config.go index b123d9e9..7a12dc9d 100644 --- a/pkg/router/config.go +++ b/pkg/router/config.go @@ -56,4 +56,16 @@ type Config struct { // When all paths are present, mutual TLS is used for // Router-to-WorkloadManager connections. MTLSConfig mtls.Config + + // OIDCIssuerURL is the OIDC provider issuer URL. + OIDCIssuerURL string + + // OIDCAudience is the expected audience claim in the JWT. + OIDCAudience string + + // OIDCRolesClaim is the dot-separated JSON path to the roles array in the JWT + OIDCRolesClaim string + + // OIDCRequiredRole is the role required to access the API. + OIDCRequiredRole string } diff --git a/pkg/router/handlers.go b/pkg/router/handlers.go index 3b44d3fa..811b87ce 100644 --- a/pkg/router/handlers.go +++ b/pkg/router/handlers.go @@ -23,6 +23,7 @@ import ( "net/http" "net/http/httputil" "net/url" + "slices" "strings" "time" @@ -69,6 +70,11 @@ func (s *Server) handleInvoke(c *gin.Context, namespace, name, path, kind string return } + // RLAC: verify the caller owns this sandbox (only for existing sessions) + if sessionID != "" && !s.checkSandboxOwnership(c, sandbox) { + return + } + // Update session activity in store when receiving request if err := s.storeClient.UpdateSessionLastActivity(c.Request.Context(), sandbox.SessionID, time.Now()); err != nil { klog.Warningf("Failed to update sandbox with session-id %s last activity for request: %v", sandbox.SessionID, err) @@ -83,6 +89,45 @@ func (s *Server) handleInvoke(c *gin.Context, namespace, name, path, kind string } } +// checkSandboxOwnership verifies the caller owns the sandbox or not. +func (s *Server) checkSandboxOwnership(c *gin.Context, sandbox *types.SandboxInfo) bool { + if s.oidcValidator == nil { + return true + } + claims := extractClaims(c) + if claims == nil { + return true + } + + // Admin role bypasses RLAC ownership checks + if slices.Contains(claims.Roles, "admin") { + return true + } + + // Fail-closed: deny access to sandboxes without an owner record + if sandbox.OwnerID == "" { + klog.V(2).Infof("RLAC denied: sandbox %s has no owner record (session: %s, caller: %s)", + sandbox.Name, sandbox.SessionID, claims.Subject) + c.JSON(http.StatusForbidden, gin.H{ + "error": "sandbox has no owner record", + "code": "FORBIDDEN", + }) + c.Abort() + return false + } + if sandbox.OwnerID != claims.Subject { + klog.V(2).Infof("RLAC denied: sandbox %s owned by %s, caller is %s", + sandbox.Name, sandbox.OwnerID, claims.Subject) + c.JSON(http.StatusForbidden, gin.H{ + "error": "you do not own this sandbox", + "code": "FORBIDDEN", + }) + c.Abort() + return false + } + return true +} + func (s *Server) handleGetSandboxError(c *gin.Context, err error) { // Fallback for other APIStatus errors if statusErr, ok := err.(apierrors.APIStatus); ok { @@ -240,6 +285,14 @@ func (s *Server) generateSandboxJWT(c *gin.Context, sandbox *types.SandboxInfo) claims := map[string]interface{}{ "session_id": sandbox.SessionID, } + + if oidcClaims := extractClaims(c); oidcClaims != nil { + claims["user_sub"] = oidcClaims.Subject + if oidcClaims.Email != "" { + claims["user_email"] = oidcClaims.Email + } + } + token, err := s.jwtManager.GenerateToken(claims) if err != nil { klog.Errorf("Failed to generate JWT token (session: %s): %v", sandbox.SessionID, err) diff --git a/pkg/router/handlers_test.go b/pkg/router/handlers_test.go index adeed670..624c3a5d 100644 --- a/pkg/router/handlers_test.go +++ b/pkg/router/handlers_test.go @@ -553,3 +553,97 @@ func TestConcurrencyLimitMiddleware_Overload(t *testing.T) { // Wait for first request to complete <-done } + +func TestCheckSandboxOwnership(t *testing.T) { + setupEnv() + defer teardownEnv() + + // Minimal mock OIDC validator (non-nil means auth is enabled) + mockValidator := &OIDCValidator{} + + tests := []struct { + name string + oidcValidator *OIDCValidator + claims *Claims + sandboxOwnerID string + expectAllowed bool + expectCode int + }{ + { + name: "auth disabled", + oidcValidator: nil, + sandboxOwnerID: "", + expectAllowed: true, + }, + { + name: "no claims in context", + oidcValidator: mockValidator, + claims: nil, + sandboxOwnerID: "user-123", + expectAllowed: true, + }, + { + name: "matching owner", + oidcValidator: mockValidator, + claims: &Claims{Subject: "user-123", Roles: []string{"sandbox:invoke"}}, + sandboxOwnerID: "user-123", + expectAllowed: true, + }, + { + name: "mismatched owner", + oidcValidator: mockValidator, + claims: &Claims{Subject: "user-456", Roles: []string{"sandbox:invoke"}}, + sandboxOwnerID: "user-123", + expectAllowed: false, + expectCode: http.StatusForbidden, + }, + { + name: "empty owner fail-closed", + oidcValidator: mockValidator, + claims: &Claims{Subject: "user-123", Roles: []string{"sandbox:invoke"}}, + sandboxOwnerID: "", + expectAllowed: false, + expectCode: http.StatusForbidden, + }, + { + name: "admin bypasses RLAC", + oidcValidator: mockValidator, + claims: &Claims{Subject: "admin-user", Roles: []string{"admin", "sandbox:invoke"}}, + sandboxOwnerID: "user-123", + expectAllowed: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := &Server{ + config: &Config{Port: "8080"}, + oidcValidator: tt.oidcValidator, + } + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request, _ = http.NewRequest("POST", "/test", nil) + + // Set claims in context if provided + if tt.claims != nil { + c.Set(string(contextKeyOIDCClaims), tt.claims) + } + + sandbox := &types.SandboxInfo{ + Name: "test-sandbox", + SessionID: "test-session", + OwnerID: tt.sandboxOwnerID, + } + + allowed := server.checkSandboxOwnership(c, sandbox) + + if allowed != tt.expectAllowed { + t.Errorf("expected allowed=%v, got %v", tt.expectAllowed, allowed) + } + if !tt.expectAllowed && w.Code != tt.expectCode { + t.Errorf("expected status %d, got %d", tt.expectCode, w.Code) + } + }) + } +} diff --git a/pkg/router/oidc.go b/pkg/router/oidc.go new file mode 100644 index 00000000..7dd1999a --- /dev/null +++ b/pkg/router/oidc.go @@ -0,0 +1,253 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "slices" + "strings" + "time" + + gooidc "github.com/coreos/go-oidc/v3/oidc" +) + +// OIDCConfig holds provider-agnostic OIDC configuration. +type OIDCConfig struct { + // IssuerURL is the OIDC provider issuer URL. + IssuerURL string + + // Audience is the expected aud claim in the JWT. + Audience string + + // RolesClaim is the JSON path to the roles array. + RolesClaim string +} + +// Claims represents validated identity extracted from an OIDC access token. +type Claims struct { + // Subject is the standard "sub" claim identifying the user. + Subject string + + // Email is the standard "email" claim (may be empty). + Email string + + // Roles extracted from the configured RolesClaim path. + Roles []string +} + +// OIDCValidator validates OIDC access tokens using cached JWKS keys. +type OIDCValidator struct { + keySet gooidc.KeySet // JWKS key set with automatic caching/rotation + issuer string // expected issuer + audience string // expected audience + rolesClaim string // dot-notation path to roles array +} + +// NewOIDCValidator creates a new OIDCValidator instance. +func NewOIDCValidator(ctx context.Context, cfg OIDCConfig) (*OIDCValidator, error) { + provider, err := gooidc.NewProvider(ctx, cfg.IssuerURL) + if err != nil { + return nil, fmt.Errorf("failed to discover OIDC provider at %s: %w", cfg.IssuerURL, err) + } + + // Extract the JWKS URL from the provider's discovery document. + var providerClaims struct { + JWKSURL string `json:"jwks_uri"` + } + if err := provider.Claims(&providerClaims); err != nil { + return nil, fmt.Errorf("failed to extract jwks_uri from OIDC discovery: %w", err) + } + + // Create a RemoteKeySet for JWKS caching and automatic key rotation. + keySet := gooidc.NewRemoteKeySet(ctx, providerClaims.JWKSURL) + + return &OIDCValidator{ + keySet: keySet, + issuer: cfg.IssuerURL, + audience: cfg.Audience, + rolesClaim: cfg.RolesClaim, + }, nil +} + +// ValidateToken verifies the JWT signature, standard claims, and extracts roles. +func (v *OIDCValidator) ValidateToken(ctx context.Context, rawToken string) (*Claims, error) { + if rawToken == "" { + return nil, fmt.Errorf("empty token") + } + + // Verify signing algorithm before signature check + if err := v.checkTokenAlgorithm(rawToken); err != nil { + return nil, err + } + + // Verify the JWT signature using the cached JWKS keys. + payload, err := v.keySet.VerifySignature(ctx, rawToken) + if err != nil { + return nil, fmt.Errorf("token signature verification failed: %w", err) + } + + // Parse all claims into a single map to avoid double-unmarshaling overhead. + var allClaims map[string]interface{} + if err := json.Unmarshal(payload, &allClaims); err != nil { + return nil, fmt.Errorf("failed to parse token claims: %w", err) + } + + // Extract standard claims + issuer, _ := allClaims["iss"].(string) + subject, _ := allClaims["sub"].(string) + email, _ := allClaims["email"].(string) + expiry, _ := allClaims["exp"].(float64) + notBefore, _ := allClaims["nbf"].(float64) + + audiences := parseAudienceClaim(allClaims["aud"]) + + // Validate issuer + if issuer != v.issuer { + return nil, fmt.Errorf("invalid issuer: got %q, expected %q", issuer, v.issuer) + } + + // Validate subject + if subject == "" { + return nil, fmt.Errorf("token missing required sub claim") + } + + // Validate expiration + if expiry == 0 { + return nil, fmt.Errorf("token missing required exp claim") + } + if time.Now().After(time.Unix(int64(expiry), 0)) { + return nil, fmt.Errorf("token has expired") + } + + // Validate not-before (if present) + if notBefore != 0 && time.Now().Before(time.Unix(int64(notBefore), 0)) { + return nil, fmt.Errorf("token is not yet valid") + } + + // Validate audience + if v.audience != "" { + if !slices.Contains(audiences, v.audience) { + return nil, fmt.Errorf("invalid audience: token audiences %v do not include %q", audiences, v.audience) + } + } + + // Extract roles from the map using the configured path. + roles := extractRolesFromClaims(allClaims, v.rolesClaim) + + return &Claims{ + Subject: subject, + Email: email, + Roles: roles, + }, nil +} + +// extractRolesFromClaims navigates a nested claims map using a dot-separated path. +func extractRolesFromClaims(claims map[string]interface{}, path string) []string { + if path == "" { + return nil + } + + parts := strings.Split(path, ".") + var current interface{} = claims + + for _, part := range parts { + m, ok := current.(map[string]interface{}) + if !ok { + return nil + } + current, ok = m[part] + if !ok { + return nil + } + } + + // The final value should be an array of strings. + arr, ok := current.([]interface{}) + if !ok { + return nil + } + + var roles []string + for _, v := range arr { + if s, ok := v.(string); ok { + roles = append(roles, s) + } + } + return roles +} + +// parseAudienceClaim extracts the "aud" claim which can be either a string or []string. +func parseAudienceClaim(audVal interface{}) []string { + if audVal == nil { + return nil + } + switch v := audVal.(type) { + case string: + return []string{v} + case []interface{}: + var audiences []string + for _, item := range v { + if s, ok := item.(string); ok { + audiences = append(audiences, s) + } + } + return audiences + } + return nil +} + +// allowedSigningAlgs contains supported asymmetric algorithms for JWT signatures. +var allowedSigningAlgs = map[string]bool{ + "RS256": true, + "RS384": true, + "RS512": true, + "ES256": true, + "ES384": true, + "ES512": true, + "PS256": true, + "PS384": true, + "PS512": true, + "EdDSA": true, +} + +// checkTokenAlgorithm verifies the JWT signing algorithm is supported. +func (v *OIDCValidator) checkTokenAlgorithm(rawToken string) error { + parts := strings.SplitN(rawToken, ".", 3) + if len(parts) != 3 { + return fmt.Errorf("malformed JWT: expected 3 parts, got %d", len(parts)) + } + + headerBytes, err := base64.RawURLEncoding.DecodeString(parts[0]) + if err != nil { + return fmt.Errorf("malformed JWT header: %w", err) + } + + var header struct { + Alg string `json:"alg"` + } + if err := json.Unmarshal(headerBytes, &header); err != nil { + return fmt.Errorf("malformed JWT header JSON: %w", err) + } + + if !allowedSigningAlgs[header.Alg] { + return fmt.Errorf("signing algorithm not supported: %q", header.Alg) + } + return nil +} diff --git a/pkg/router/oidc_test.go b/pkg/router/oidc_test.go new file mode 100644 index 00000000..160a08e9 --- /dev/null +++ b/pkg/router/oidc_test.go @@ -0,0 +1,293 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "context" + "crypto/rand" + "crypto/rsa" + "encoding/base64" + "encoding/json" + "math/big" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/golang-jwt/jwt/v5" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// testOIDCServer creates an httptest.Server that serves: +// 1. GET /.well-known/openid-configuration → OIDC discovery JSON +// 2. GET /jwks → JWKS key set JSON +func testOIDCServer(t *testing.T) (*httptest.Server, *rsa.PrivateKey) { + t.Helper() + + privateKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + + var issuer string + mux := http.NewServeMux() + + mux.HandleFunc("/.well-known/openid-configuration", func(w http.ResponseWriter, _ *http.Request) { + discovery := map[string]interface{}{ + "issuer": issuer, + "jwks_uri": issuer + "/jwks", + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(discovery) + }) + + mux.HandleFunc("/jwks", func(w http.ResponseWriter, _ *http.Request) { + jwks := map[string]interface{}{ + "keys": []map[string]interface{}{ + { + "kty": "RSA", + "kid": "test-key-1", + "use": "sig", + "alg": "RS256", + "n": base64.RawURLEncoding.EncodeToString(privateKey.N.Bytes()), + "e": base64.RawURLEncoding.EncodeToString(big.NewInt(int64(privateKey.E)).Bytes()), + }, + }, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(jwks) + }) + + ts := httptest.NewServer(mux) + issuer = ts.URL + return ts, privateKey +} + +// mintTestJWT creates a signed JWT for testing with configurable claims. +func mintTestJWT(t *testing.T, privateKey *rsa.PrivateKey, claims jwt.MapClaims) string { + t.Helper() + token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims) + token.Header["kid"] = "test-key-1" + tokenString, err := token.SignedString(privateKey) + require.NoError(t, err) + return tokenString +} + +func TestNewOIDCValidator(t *testing.T) { + ts, _ := testOIDCServer(t) + defer ts.Close() + + cfg := OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + RolesClaim: "realm_access.roles", + } + + validator, err := NewOIDCValidator(context.Background(), cfg) + require.NoError(t, err) + assert.NotNil(t, validator) + assert.Equal(t, ts.URL, validator.issuer) + assert.Equal(t, "agentcube-api", validator.audience) + assert.Equal(t, "realm_access.roles", validator.rolesClaim) +} + +func TestValidateToken_ValidToken(t *testing.T) { + ts, privateKey := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + RolesClaim: "realm_access.roles", + }) + require.NoError(t, err) + + rawToken := mintTestJWT(t, privateKey, jwt.MapClaims{ + "iss": ts.URL, + "sub": "user-123", + "aud": "agentcube-api", + "email": "test@example.com", + "exp": time.Now().Add(5 * time.Minute).Unix(), + "iat": time.Now().Unix(), + "realm_access": map[string]interface{}{ + "roles": []interface{}{"sandbox:invoke", "sandbox:manage"}, + }, + }) + + claims, err := validator.ValidateToken(context.Background(), rawToken) + require.NoError(t, err) + assert.Equal(t, "user-123", claims.Subject) + assert.Equal(t, "test@example.com", claims.Email) + assert.Equal(t, []string{"sandbox:invoke", "sandbox:manage"}, claims.Roles) +} + +func TestValidateToken_ExpiredToken(t *testing.T) { + ts, privateKey := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + }) + require.NoError(t, err) + + rawToken := mintTestJWT(t, privateKey, jwt.MapClaims{ + "iss": ts.URL, + "sub": "user-123", + "aud": "agentcube-api", + "exp": time.Now().Add(-1 * time.Hour).Unix(), + "iat": time.Now().Add(-2 * time.Hour).Unix(), + }) + + _, err = validator.ValidateToken(context.Background(), rawToken) + assert.Error(t, err) + assert.Contains(t, err.Error(), "expired") +} + +func TestValidateToken_WrongAudience(t *testing.T) { + ts, privateKey := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + }) + require.NoError(t, err) + + rawToken := mintTestJWT(t, privateKey, jwt.MapClaims{ + "iss": ts.URL, + "sub": "user-123", + "aud": "wrong-audience", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err = validator.ValidateToken(context.Background(), rawToken) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid audience") +} + +func TestValidateToken_WrongIssuer(t *testing.T) { + ts, privateKey := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + }) + require.NoError(t, err) + + rawToken := mintTestJWT(t, privateKey, jwt.MapClaims{ + "iss": "http://wrong-issuer.example.com", + "sub": "user-123", + "aud": "agentcube-api", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err = validator.ValidateToken(context.Background(), rawToken) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid issuer") +} + +func TestValidateToken_InvalidSignature(t *testing.T) { + ts, _ := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + }) + require.NoError(t, err) + + // Sign with a different key than what the JWKS endpoint serves + wrongKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + + rawToken := mintTestJWT(t, wrongKey, jwt.MapClaims{ + "iss": ts.URL, + "sub": "user-123", + "aud": "agentcube-api", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err = validator.ValidateToken(context.Background(), rawToken) + assert.Error(t, err) + assert.Contains(t, err.Error(), "signature verification failed") +} + +func TestExtractRolesFromClaims(t *testing.T) { + tests := []struct { + name string + claims map[string]interface{} + path string + expected []string + }{ + { + name: "Keycloak nested path", + claims: map[string]interface{}{ + "realm_access": map[string]interface{}{ + "roles": []interface{}{"sandbox:invoke", "sandbox:manage"}, + }, + }, + path: "realm_access.roles", + expected: []string{"sandbox:invoke", "sandbox:manage"}, + }, + { + name: "missing intermediate key returns nil", + claims: map[string]interface{}{"other": "value"}, + path: "realm_access.roles", + expected: nil, + }, + { + name: "non-array final value returns nil", + claims: map[string]interface{}{ + "realm_access": map[string]interface{}{ + "roles": "not-an-array", + }, + }, + path: "realm_access.roles", + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := extractRolesFromClaims(tt.claims, tt.path) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestValidateToken_MissingSubject(t *testing.T) { + ts, privateKey := testOIDCServer(t) + defer ts.Close() + + validator, err := NewOIDCValidator(context.Background(), OIDCConfig{ + IssuerURL: ts.URL, + Audience: "agentcube-api", + }) + require.NoError(t, err) + + // Mint a token with no "sub" claim + rawToken := mintTestJWT(t, privateKey, jwt.MapClaims{ + "iss": ts.URL, + "aud": "agentcube-api", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err = validator.ValidateToken(context.Background(), rawToken) + assert.Error(t, err) + assert.Contains(t, err.Error(), "token missing required sub claim") +} diff --git a/pkg/router/server.go b/pkg/router/server.go index 6e571c73..ff7adadf 100644 --- a/pkg/router/server.go +++ b/pkg/router/server.go @@ -39,6 +39,7 @@ type Server struct { storeClient store.Store httpTransport *http.Transport // Reusable HTTP transport for connection pooling jwtManager *JWTManager // JWT manager for signing requests to sandboxes + oidcValidator *OIDCValidator // nil when external auth is disabled } // NewServer creates a new Router API server instance @@ -58,8 +59,20 @@ func NewServer(config *Config) (*Server, error) { config.InitialConnectRetryInterval = 200 * time.Millisecond } - // Create session manager with store client and mTLS config - sessionManager, err := NewSessionManager(store.Storage(), &config.MTLSConfig) + // Initialize JWT manager for signing requests to sandboxes. + jwtManager, err := NewJWTManager() + if err != nil { + return nil, fmt.Errorf("failed to create JWT manager: %w", err) + } + + // Try to load existing keys from secret or store new ones + if err := jwtManager.TryStoreOrLoadJWTKeySecret(context.Background()); err != nil { + return nil, fmt.Errorf("failed to store/load JWT key secret: %w", err) + } + klog.Info("JWT manager initialized successfully") + + // Create session manager with store client, mTLS config, and JWT manager + sessionManager, err := NewSessionManager(store.Storage(), &config.MTLSConfig, jwtManager) if err != nil { return nil, fmt.Errorf("failed to create session manager: %w", err) } @@ -82,22 +95,31 @@ func NewServer(config *Config) (*Server, error) { sessionManager: sessionManager, storeClient: store.Storage(), httpTransport: httpTransport, + jwtManager: jwtManager, } - // Initialize JWT manager for signing requests to sandboxes - jwtManager, err := NewJWTManager() - if err != nil { - return nil, fmt.Errorf("failed to create JWT manager: %w", err) - } - - // Try to load existing keys from secret or store new ones - if err := jwtManager.TryStoreOrLoadJWTKeySecret(context.Background()); err != nil { - return nil, fmt.Errorf("failed to store/load JWT key secret: %w", err) + // Initialize OIDC validator when issuer URL is configured. + if config.OIDCIssuerURL != "" { + if config.OIDCRolesClaim == "" { + return nil, fmt.Errorf("--oidc-roles-claim is required when --oidc-issuer-url is set") + } + if config.OIDCRequiredRole == "" { + return nil, fmt.Errorf("--oidc-required-role is required when --oidc-issuer-url is set") + } + oidcCfg := OIDCConfig{ + IssuerURL: config.OIDCIssuerURL, + Audience: config.OIDCAudience, + RolesClaim: config.OIDCRolesClaim, + } + validator, err := NewOIDCValidator(context.Background(), oidcCfg) + if err != nil { + return nil, fmt.Errorf("failed to initialize OIDC validator: %w", err) + } + server.oidcValidator = validator + klog.Infof("External authentication enabled: issuer=%s, audience=%s, rolesClaim=%s, requiredRole=%s", + config.OIDCIssuerURL, config.OIDCAudience, config.OIDCRolesClaim, config.OIDCRequiredRole) } - server.jwtManager = jwtManager - klog.Info("JWT manager initialized successfully") - // Setup routes server.setupRoutes() @@ -141,7 +163,12 @@ func (s *Server) setupRoutes() { // Add middleware v1.Use(gin.Logger()) v1.Use(gin.Recovery()) - + // External auth: validate OIDC JWT (no-op when oidcValidator is nil) + v1.Use(s.oidcAuthMiddleware()) + // RBAC: require the configured role (only when auth is enabled) + if s.oidcValidator != nil { + v1.Use(requireRole(s.config.OIDCRequiredRole)) + } v1.Use(s.concurrencyLimitMiddleware()) // Apply concurrency limit to API routes // Agent invoke requests (support GET/POST, since downstream uses these methods) diff --git a/pkg/router/session_manager.go b/pkg/router/session_manager.go index 960f3404..7d8ed0dc 100644 --- a/pkg/router/session_manager.go +++ b/pkg/router/session_manager.go @@ -56,13 +56,15 @@ type manager struct { workloadMgrAddr string httpClient *http.Client closeMTLS func() + jwtManager *JWTManager } // NewSessionManager returns a SessionManager implementation. // storeClient is used to query sandbox information from store // workloadMgrAddr is read from the environment variable WORKLOAD_MANAGER_URL. // When mtlsCfg includes cert, key, and CA paths, the HTTP client uses mTLS to connect to WorkloadManager. -func NewSessionManager(storeClient store.Store, mtlsCfg *mtls.Config) (SessionManager, error) { +// jwtMgr is used to sign identity JWTs forwarded to WorkloadManager (may be nil when auth is disabled). +func NewSessionManager(storeClient store.Store, mtlsCfg *mtls.Config, jwtMgr *JWTManager) (SessionManager, error) { workloadMgrAddr := os.Getenv("WORKLOAD_MANAGER_URL") if workloadMgrAddr == "" { return nil, fmt.Errorf("WORKLOAD_MANAGER_URL environment variable is not set") @@ -113,6 +115,7 @@ func NewSessionManager(storeClient store.Store, mtlsCfg *mtls.Config) (SessionMa storeClient: storeClient, workloadMgrAddr: workloadMgrAddr, closeMTLS: closeMTLS, + jwtManager: jwtMgr, httpClient: &http.Client{ Timeout: 2 * time.Minute, // consistent with manager setting Transport: transport, @@ -183,6 +186,8 @@ func (m *manager) createSandbox(ctx context.Context, namespace string, name stri if token := loadWorkloadManagerAuthToken(); token != "" { req.Header.Set("Authorization", "Bearer "+token) } + // Forward external user identity for downstream tracking. + m.setIdentityHeader(ctx, req) // Send the request resp, err := m.httpClient.Do(req) @@ -227,12 +232,37 @@ func (m *manager) createSandbox(ctx context.Context, namespace string, name stri SandboxID: res.SandboxID, Name: res.SandboxName, SessionID: res.SessionID, + OwnerID: res.OwnerID, EntryPoints: res.EntryPoints, } return sandbox, nil } +// setIdentityHeader signs and attaches the caller's identity as a JWT. +func (m *manager) setIdentityHeader(ctx context.Context, req *http.Request) { + if m.jwtManager == nil { + return + } + claims, ok := ctx.Value(contextKeyOIDCClaims).(*Claims) + if !ok || claims == nil { + return + } + identityClaims := map[string]interface{}{ + "sub": claims.Subject, + "aud": "workloadmanager", + } + if claims.Email != "" { + identityClaims["email"] = claims.Email + } + identityToken, err := m.jwtManager.GenerateToken(identityClaims) + if err != nil { + klog.Warningf("Failed to sign identity JWT for WM: %v", err) + return + } + req.Header.Set("X-AgentCube-User-Identity", identityToken) +} + func loadWorkloadManagerAuthToken() string { b, err := os.ReadFile(serviceAccountTokenPath) if err != nil { diff --git a/pkg/router/session_manager_test.go b/pkg/router/session_manager_test.go index 8b585f4b..9c6a0dab 100644 --- a/pkg/router/session_manager_test.go +++ b/pkg/router/session_manager_test.go @@ -557,7 +557,7 @@ func TestNewSessionManager_MTLSEnabled_ValidCerts(t *testing.T) { t.Setenv("WORKLOAD_MANAGER_URL", "https://localhost:8080") cfg := &mtls.Config{CertFile: certFile, KeyFile: keyFile, CAFile: caFile} - sm, err := NewSessionManager(&fakeStoreClient{}, cfg) + sm, err := NewSessionManager(&fakeStoreClient{}, cfg, nil) if err != nil { t.Fatalf("NewSessionManager with mTLS failed: %v", err) } @@ -590,7 +590,7 @@ func TestNewSessionManager_MTLSDisabled(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - sm, err := NewSessionManager(&fakeStoreClient{}, tt.cfg) + sm, err := NewSessionManager(&fakeStoreClient{}, tt.cfg, nil) if err != nil { t.Fatalf("NewSessionManager without mTLS failed: %v", err) } @@ -674,3 +674,71 @@ func writePEMFile(t *testing.T, path, blockType string, data []byte) { t.Fatalf("encode PEM %s: %v", path, err) } } + +// ---- tests: setIdentityHeader ---- + +func TestSetIdentityHeader_WithClaims(t *testing.T) { + jwtMgr, err := NewJWTManager() + if err != nil { + t.Fatalf("failed to create JWT manager: %v", err) + } + + m := &manager{ + jwtManager: jwtMgr, + } + + req, _ := http.NewRequest("GET", "/", nil) + claims := &Claims{ + Subject: "user-123", + Email: "test@example.com", + } + ctx := context.WithValue(context.Background(), contextKeyOIDCClaims, claims) + + m.setIdentityHeader(ctx, req) + + token := req.Header.Get("X-AgentCube-User-Identity") + if token == "" { + t.Fatalf("expected X-AgentCube-User-Identity header to be set") + } +} + +func TestSetIdentityHeader_NoClaimsOrNoManager(t *testing.T) { + jwtMgr, err := NewJWTManager() + if err != nil { + t.Fatalf("failed to create JWT manager: %v", err) + } + + tests := []struct { + name string + jwtManager *JWTManager + ctx context.Context + }{ + { + name: "nil jwt manager", + jwtManager: nil, + ctx: context.WithValue(context.Background(), contextKeyOIDCClaims, &Claims{Subject: "user"}), + }, + { + name: "no claims in context", + jwtManager: jwtMgr, + ctx: context.Background(), + }, + { + name: "wrong type in context", + jwtManager: jwtMgr, + ctx: context.WithValue(context.Background(), contextKeyOIDCClaims, "not-a-claims-object"), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := &manager{jwtManager: tt.jwtManager} + req, _ := http.NewRequest("GET", "/", nil) + m.setIdentityHeader(tt.ctx, req) + + if token := req.Header.Get("X-AgentCube-User-Identity"); token != "" { + t.Errorf("expected no identity header, got %q", token) + } + }) + } +} diff --git a/pkg/workloadmanager/handlers.go b/pkg/workloadmanager/handlers.go index 7d417c6e..3d03d47d 100644 --- a/pkg/workloadmanager/handlers.go +++ b/pkg/workloadmanager/handlers.go @@ -120,6 +120,11 @@ func (s *Server) handleSandboxCreate(c *gin.Context, kind string) { return } + // Set ownership from the Router-signed identity JWT + if ownerID := extractOwnerID(c.Request); ownerID != "" { + sandboxEntry.OwnerID = ownerID + } + // Calculate sandbox name and namespace before creating sandboxName := sandbox.Name namespace := sandbox.Namespace @@ -143,38 +148,43 @@ func (s *Server) handleSandboxCreate(c *gin.Context, kind string) { response, err := s.createSandbox(c.Request.Context(), dynamicClient, sandbox, sandboxClaim, sandboxEntry, resultChan) if err != nil { - // Client disconnected — abort with 499 so logs/metrics reflect the cancellation. - if errors.Is(err, context.Canceled) { - klog.Warningf("create sandbox aborted %s/%s: client disconnected", sandbox.Namespace, sandbox.Name) - c.AbortWithStatus(499) - return - } - // Deadline exceeded — client may still be connected; return 504 so they get a meaningful response. - if errors.Is(err, context.DeadlineExceeded) { - klog.Warningf("create sandbox timed out %s/%s: request deadline exceeded", sandbox.Namespace, sandbox.Name) - respondError(c, http.StatusGatewayTimeout, "request timed out") - return - } - // Internal sandbox-ready wait timed out; surface as 504 rather than a generic 500. - if errors.Is(err, errSandboxCreationTimeout) { - klog.Warningf("create sandbox timed out %s/%s: sandbox did not become ready within deadline", sandbox.Namespace, sandbox.Name) - respondError(c, http.StatusGatewayTimeout, err.Error()) - return - } - klog.Errorf("create sandbox failed %s/%s: %v", sandbox.Namespace, sandbox.Name, err) - // Internal errors (store, K8s API) must not leak system details to callers; - // sandbox-level failures (terminal pod state, timeout) are safe to surface. - msg := err.Error() - if apierrors.IsInternalError(err) { - msg = "internal server error" - } - respondError(c, http.StatusInternalServerError, msg) + respondCreateError(c, sandbox.Namespace, sandbox.Name, err) return } respondJSON(c, http.StatusOK, response) } +// respondCreateError maps sandbox-creation errors to the appropriate HTTP response. +func respondCreateError(c *gin.Context, namespace, name string, err error) { + // Client disconnected — abort with 499 so logs/metrics reflect the cancellation. + if errors.Is(err, context.Canceled) { + klog.Warningf("create sandbox aborted %s/%s: client disconnected", namespace, name) + c.AbortWithStatus(499) + return + } + // Deadline exceeded — client may still be connected; return 504 so they get a meaningful response. + if errors.Is(err, context.DeadlineExceeded) { + klog.Warningf("create sandbox timed out %s/%s: request deadline exceeded", namespace, name) + respondError(c, http.StatusGatewayTimeout, "request timed out") + return + } + // Internal sandbox-ready wait timed out; surface as 504 rather than a generic 500. + if errors.Is(err, errSandboxCreationTimeout) { + klog.Warningf("create sandbox timed out %s/%s: sandbox did not become ready within deadline", namespace, name) + respondError(c, http.StatusGatewayTimeout, err.Error()) + return + } + klog.Errorf("create sandbox failed %s/%s: %v", namespace, name, err) + // Internal errors (store, K8s API) must not leak system details to callers; + // sandbox-level failures (terminal pod state, timeout) are safe to surface. + msg := err.Error() + if apierrors.IsInternalError(err) { + msg = "internal server error" + } + respondError(c, http.StatusInternalServerError, msg) +} + // createK8sResources creates the K8s sandbox or sandbox claim resource. func (s *Server) createK8sResources(ctx context.Context, dynamicClient dynamic.Interface, sandbox *sandboxv1alpha1.Sandbox, sandboxClaim *extensionsv1alpha1.SandboxClaim) error { if sandboxClaim != nil { @@ -269,6 +279,7 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf SandboxID: storeCacheInfo.SandboxID, SandboxName: sandbox.Name, EntryPoints: storeCacheInfo.EntryPoints, + OwnerID: sandboxEntry.OwnerID, } if err := s.storeClient.UpdateSandbox(ctx, storeCacheInfo); err != nil { diff --git a/pkg/workloadmanager/identity.go b/pkg/workloadmanager/identity.go new file mode 100644 index 00000000..e3fb2cd8 --- /dev/null +++ b/pkg/workloadmanager/identity.go @@ -0,0 +1,105 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloadmanager + +import ( + "crypto/rsa" + "crypto/sha256" + "crypto/x509" + "encoding/hex" + "encoding/pem" + "fmt" + "net/http" + + "github.com/golang-jwt/jwt/v5" + "k8s.io/klog/v2" +) + +// identityJWTHeader is the header carrying the Router-signed user identity. +const identityJWTHeader = "X-AgentCube-User-Identity" + +// verifyIdentityJWT parses the Router-signed identity JWT, verifies the RSA signature, validates aud=="workloadmanager", and returns the sub claim. +func verifyIdentityJWT(publicKeyPEM string, rawToken string) (string, error) { + block, _ := pem.Decode([]byte(publicKeyPEM)) + if block == nil { + return "", fmt.Errorf("failed to decode public key PEM") + } + + pub, err := x509.ParsePKIXPublicKey(block.Bytes) + if err != nil { + return "", fmt.Errorf("failed to parse public key: %w", err) + } + + rsaPub, ok := pub.(*rsa.PublicKey) + if !ok { + return "", fmt.Errorf("public key is not RSA") + } + + token, err := jwt.Parse(rawToken, func(_ *jwt.Token) (interface{}, error) { + return rsaPub, nil + }, jwt.WithValidMethods([]string{"RS256"}), jwt.WithExpirationRequired()) + if err != nil { + return "", fmt.Errorf("token verification failed: %w", err) + } + + claims, ok := token.Claims.(jwt.MapClaims) + if !ok { + return "", fmt.Errorf("unexpected claims type") + } + + // Validate audience + aud, _ := claims["aud"].(string) + if aud != "workloadmanager" { + return "", fmt.Errorf("invalid audience: %q", aud) + } + + sub, _ := claims["sub"].(string) + if sub == "" { + return "", fmt.Errorf("missing sub claim") + } + + return sub, nil +} + +// extractOwnerID reads the identity JWT from the request header and returns the verified subject, or empty string if the header is absent or invalid. +func extractOwnerID(r *http.Request) string { + rawToken := r.Header.Get(identityJWTHeader) + if rawToken == "" { + return "" + } + + publicKey := GetCachedPublicKey() + if publicKey == "" { + klog.V(2).Info("Identity JWT present but public key not cached, skipping owner extraction") + return "" + } + + sub, err := verifyIdentityJWT(publicKey, rawToken) + if err != nil { + klog.V(2).Infof("Identity JWT verification failed: %v", err) + return "" + } + + return sub +} + +// sha256Short returns the first 63 characters of the hex-encoded SHA-256 hash. +func sha256Short(s string) string { + h := sha256.Sum256([]byte(s)) + full := hex.EncodeToString(h[:]) + return full[:63] +} diff --git a/pkg/workloadmanager/identity_test.go b/pkg/workloadmanager/identity_test.go new file mode 100644 index 00000000..eb07225f --- /dev/null +++ b/pkg/workloadmanager/identity_test.go @@ -0,0 +1,171 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloadmanager + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "encoding/pem" + "testing" + "time" + + "github.com/golang-jwt/jwt/v5" +) + +func generateTestKeyPair(t *testing.T) (*rsa.PrivateKey, string) { + t.Helper() + privateKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + t.Fatalf("failed to generate RSA key: %v", err) + } + + pubBytes, err := x509.MarshalPKIXPublicKey(&privateKey.PublicKey) + if err != nil { + t.Fatalf("failed to marshal public key: %v", err) + } + + pubPEM := pem.EncodeToMemory(&pem.Block{ + Type: "PUBLIC KEY", + Bytes: pubBytes, + }) + + return privateKey, string(pubPEM) +} + +func signTestToken(t *testing.T, key *rsa.PrivateKey, claims jwt.MapClaims) string { + t.Helper() + token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims) + signed, err := token.SignedString(key) + if err != nil { + t.Fatalf("failed to sign token: %v", err) + } + return signed +} + +func TestVerifyIdentityJWT_ValidToken(t *testing.T) { + privKey, pubPEM := generateTestKeyPair(t) + + token := signTestToken(t, privKey, jwt.MapClaims{ + "sub": "user-123", + "aud": "workloadmanager", + "exp": time.Now().Add(5 * time.Minute).Unix(), + "iat": time.Now().Unix(), + }) + + sub, err := verifyIdentityJWT(pubPEM, token) + if err != nil { + t.Fatalf("expected no error, got: %v", err) + } + if sub != "user-123" { + t.Errorf("expected sub 'user-123', got %q", sub) + } +} + +func TestVerifyIdentityJWT_ExpiredToken(t *testing.T) { + privKey, pubPEM := generateTestKeyPair(t) + + token := signTestToken(t, privKey, jwt.MapClaims{ + "sub": "user-123", + "aud": "workloadmanager", + "exp": time.Now().Add(-5 * time.Minute).Unix(), + }) + + _, err := verifyIdentityJWT(pubPEM, token) + if err == nil { + t.Fatal("expected error for expired token") + } +} + +func TestVerifyIdentityJWT_WrongAudience(t *testing.T) { + privKey, pubPEM := generateTestKeyPair(t) + + token := signTestToken(t, privKey, jwt.MapClaims{ + "sub": "user-123", + "aud": "wrong-audience", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err := verifyIdentityJWT(pubPEM, token) + if err == nil { + t.Fatal("expected error for wrong audience") + } +} + +func TestVerifyIdentityJWT_TamperedSignature(t *testing.T) { + privKey, pubPEM := generateTestKeyPair(t) + + token := signTestToken(t, privKey, jwt.MapClaims{ + "sub": "user-123", + "aud": "workloadmanager", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + // Use a completely different key to sign the same claims + otherKey, err := rsa.GenerateKey(rand.Reader, 2048) + if err != nil { + t.Fatalf("failed to generate second RSA key: %v", err) + } + tampered := signTestToken(t, otherKey, jwt.MapClaims{ + "sub": "user-123", + "aud": "workloadmanager", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + // Verify the original token works + _, err = verifyIdentityJWT(pubPEM, token) + if err != nil { + t.Fatalf("valid token should pass: %v", err) + } + + // Verify the tampered token (signed with different key) fails + _, err = verifyIdentityJWT(pubPEM, tampered) + if err == nil { + t.Fatal("expected error for token signed with different key") + } +} + +func TestVerifyIdentityJWT_MissingSub(t *testing.T) { + privKey, pubPEM := generateTestKeyPair(t) + + token := signTestToken(t, privKey, jwt.MapClaims{ + "aud": "workloadmanager", + "exp": time.Now().Add(5 * time.Minute).Unix(), + }) + + _, err := verifyIdentityJWT(pubPEM, token) + if err == nil { + t.Fatal("expected error for missing sub") + } +} + +func TestSha256Short(t *testing.T) { + result := sha256Short("test-user-id") + if len(result) != 63 { + t.Errorf("expected length 63, got %d", len(result)) + } + + // Verify deterministic + if sha256Short("test-user-id") != result { + t.Error("sha256Short should be deterministic") + } + + // Verify different inputs produce different outputs + if sha256Short("other-user-id") == result { + t.Error("different inputs should produce different hashes") + } +} diff --git a/pkg/workloadmanager/k8s_client.go b/pkg/workloadmanager/k8s_client.go index 1ebf0387..2c44d8e3 100644 --- a/pkg/workloadmanager/k8s_client.go +++ b/pkg/workloadmanager/k8s_client.go @@ -80,6 +80,7 @@ type K8sClient struct { type sandboxEntry struct { Kind string SessionID string + OwnerID string Ports []runtimev1alpha1.TargetPort IdleTimeout time.Duration } diff --git a/pkg/workloadmanager/sandbox_helper.go b/pkg/workloadmanager/sandbox_helper.go index 322b5d8d..0d0d467c 100644 --- a/pkg/workloadmanager/sandbox_helper.go +++ b/pkg/workloadmanager/sandbox_helper.go @@ -61,6 +61,7 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE return &types.SandboxInfo{ Kind: entry.Kind, SessionID: entry.SessionID, + OwnerID: entry.OwnerID, SandboxNamespace: sandboxCR.GetNamespace(), Name: sandboxCR.GetName(), ExpiresAt: expiresAt, @@ -94,6 +95,7 @@ func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *san SandboxNamespace: sandbox.GetNamespace(), EntryPoints: accesses, SessionID: entry.SessionID, + OwnerID: entry.OwnerID, CreatedAt: createdAt, ExpiresAt: expiresAt, Status: getSandboxStatus(sandbox), diff --git a/pkg/workloadmanager/sandbox_helper_test.go b/pkg/workloadmanager/sandbox_helper_test.go index 793108ad..1c8fc917 100644 --- a/pkg/workloadmanager/sandbox_helper_test.go +++ b/pkg/workloadmanager/sandbox_helper_test.go @@ -451,3 +451,68 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) { }) } } + +func TestBuildSandboxPlaceHolder_OwnerID(t *testing.T) { + tests := []struct { + name string + ownerID string + }{ + {"with owner", "user-123"}, + {"without owner", ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sandbox := &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-sandbox", + Namespace: "default", + }, + } + entry := &sandboxEntry{ + Kind: types.SandboxKind, + SessionID: "session-123", + OwnerID: tt.ownerID, + IdleTimeout: 15 * time.Minute, + } + + info := buildSandboxPlaceHolder(sandbox, entry) + assert.Equal(t, tt.ownerID, info.OwnerID) + }) + } +} + +func TestBuildSandboxInfo_OwnerID(t *testing.T) { + tests := []struct { + name string + ownerID string + }{ + {"with owner", "user-456"}, + {"without owner", ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sandbox := &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-sandbox", + Namespace: "default", + UID: "uid-123", + CreationTimestamp: metav1.Now(), + }, + } + entry := &sandboxEntry{ + Kind: types.SandboxKind, + SessionID: "session-456", + OwnerID: tt.ownerID, + Ports: []runtimev1alpha1.TargetPort{ + {Port: 8080, Protocol: runtimev1alpha1.ProtocolTypeHTTP, PathPrefix: "/"}, + }, + IdleTimeout: 15 * time.Minute, + } + + info := buildSandboxInfo(sandbox, "10.0.0.1", entry) + assert.Equal(t, tt.ownerID, info.OwnerID) + }) + } +} diff --git a/pkg/workloadmanager/workload_builder.go b/pkg/workloadmanager/workload_builder.go index 7e97c7dc..f53917e1 100644 --- a/pkg/workloadmanager/workload_builder.go +++ b/pkg/workloadmanager/workload_builder.go @@ -133,6 +133,7 @@ type buildSandboxParams struct { workloadName string sandboxName string sessionID string + ownerID string ttl time.Duration idleTimeout time.Duration podSpec corev1.PodSpec @@ -145,6 +146,7 @@ type buildSandboxClaimParams struct { name string sandboxTemplateName string sessionID string + ownerID string idleTimeout time.Duration // ownerReference is the reference to the CodeInterpreter that creates this SandboxClaim ownerReference *metav1.OwnerReference @@ -203,6 +205,13 @@ func buildSandboxObject(params *buildSandboxParams) *sandboxv1alpha1.Sandbox { Replicas: ptr.To[int32](1), }, } + + // Ownership metadata for RLAC + if params.ownerID != "" { + sandbox.ObjectMeta.Annotations["agentcube.io/owner"] = params.ownerID + sandbox.ObjectMeta.Labels["agentcube.io/owner-hash"] = sha256Short(params.ownerID) + } + return sandbox } @@ -237,6 +246,13 @@ func buildSandboxClaimObject(params *buildSandboxClaimParams) *extensionsv1alpha if params.ownerReference != nil { sandboxClaim.ObjectMeta.OwnerReferences = []metav1.OwnerReference{*params.ownerReference} } + + // Ownership metadata for RLAC + if params.ownerID != "" { + sandboxClaim.ObjectMeta.Annotations["agentcube.io/owner"] = params.ownerID + sandboxClaim.ObjectMeta.Labels["agentcube.io/owner-hash"] = sha256Short(params.ownerID) + } + return sandboxClaim } diff --git a/pkg/workloadmanager/workload_builder_test.go b/pkg/workloadmanager/workload_builder_test.go index b7e44c38..0203e396 100644 --- a/pkg/workloadmanager/workload_builder_test.go +++ b/pkg/workloadmanager/workload_builder_test.go @@ -685,3 +685,105 @@ func TestBuildSandboxByCodeInterpreter_SuccessWithWarmPool(t *testing.T) { } assertOwnerReference(t, claim.OwnerReferences[0]) } + +func TestBuildSandboxObject_OwnershipLabels(t *testing.T) { + tests := []struct { + name string + ownerID string + wantLabel bool + wantAnnot bool + }{ + { + name: "ownerID set", + ownerID: "user-123", + wantLabel: true, + wantAnnot: true, + }, + { + name: "ownerID empty", + ownerID: "", + wantLabel: false, + wantAnnot: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + params := &buildSandboxParams{ + namespace: "default", + sandboxName: "sandbox-owner-test", + sessionID: "session-owner", + ownerID: tt.ownerID, + ttl: time.Hour, + idleTimeout: 15 * time.Minute, + } + sandbox := buildSandboxObject(params) + + _, hasAnnot := sandbox.ObjectMeta.Annotations["agentcube.io/owner"] + _, hasLabel := sandbox.ObjectMeta.Labels["agentcube.io/owner-hash"] + + if hasAnnot != tt.wantAnnot { + t.Errorf("expected annotation present=%v, got %v", tt.wantAnnot, hasAnnot) + } + if hasLabel != tt.wantLabel { + t.Errorf("expected label present=%v, got %v", tt.wantLabel, hasLabel) + } + + if tt.ownerID != "" { + if sandbox.ObjectMeta.Annotations["agentcube.io/owner"] != tt.ownerID { + t.Errorf("expected annotation value %q, got %q", tt.ownerID, sandbox.ObjectMeta.Annotations["agentcube.io/owner"]) + } + hash := sandbox.ObjectMeta.Labels["agentcube.io/owner-hash"] + if len(hash) != 63 { + t.Errorf("expected owner-hash length 63, got %d", len(hash)) + } + } + }) + } +} + +func TestBuildSandboxClaimObject_OwnershipLabels(t *testing.T) { + tests := []struct { + name string + ownerID string + wantLabel bool + wantAnnot bool + }{ + { + name: "ownerID set", + ownerID: "user-456", + wantLabel: true, + wantAnnot: true, + }, + { + name: "ownerID empty", + ownerID: "", + wantLabel: false, + wantAnnot: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + params := &buildSandboxClaimParams{ + namespace: "default", + name: "claim-owner-test", + sandboxTemplateName: "my-ci", + sessionID: "session-claim-owner", + ownerID: tt.ownerID, + idleTimeout: 10 * time.Minute, + } + claim := buildSandboxClaimObject(params) + + _, hasAnnot := claim.ObjectMeta.Annotations["agentcube.io/owner"] + _, hasLabel := claim.ObjectMeta.Labels["agentcube.io/owner-hash"] + + if hasAnnot != tt.wantAnnot { + t.Errorf("expected annotation present=%v, got %v", tt.wantAnnot, hasAnnot) + } + if hasLabel != tt.wantLabel { + t.Errorf("expected label present=%v, got %v", tt.wantLabel, hasLabel) + } + }) + } +} diff --git a/sdk-python/agentcube/__init__.py b/sdk-python/agentcube/__init__.py index 36c2b70c..496c8ef0 100644 --- a/sdk-python/agentcube/__init__.py +++ b/sdk-python/agentcube/__init__.py @@ -14,5 +14,12 @@ from .code_interpreter import CodeInterpreterClient from .agent_runtime import AgentRuntimeClient +from .auth import AuthProvider, TokenAuth, ServiceAccountAuth -__all__ = ["CodeInterpreterClient", "AgentRuntimeClient"] +__all__ = [ + "CodeInterpreterClient", + "AgentRuntimeClient", + "AuthProvider", + "TokenAuth", + "ServiceAccountAuth", +] diff --git a/sdk-python/agentcube/agent_runtime.py b/sdk-python/agentcube/agent_runtime.py index d5322178..9e6cf8b7 100644 --- a/sdk-python/agentcube/agent_runtime.py +++ b/sdk-python/agentcube/agent_runtime.py @@ -17,6 +17,7 @@ from typing import Any, Dict, Optional from requests.exceptions import JSONDecodeError +from agentcube.auth import AuthProvider from agentcube.clients.agent_runtime_data_plane import AgentRuntimeDataPlaneClient from agentcube.utils.log import get_logger @@ -31,6 +32,8 @@ def __init__( session_id: Optional[str] = None, timeout: int = 120, connect_timeout: float = 5.0, + auth_token: Optional[str] = None, + auth: Optional[AuthProvider] = None, ): self.agent_name = agent_name self.namespace = namespace @@ -40,6 +43,11 @@ def __init__( level = logging.DEBUG if verbose else logging.INFO self.logger = get_logger(__name__, level=level) + self._auth = auth + if not self._auth and auth_token: + from agentcube.auth import TokenAuth + self._auth = TokenAuth(auth_token) + router_url = router_url or os.getenv("ROUTER_URL") if not router_url: raise ValueError( @@ -55,6 +63,7 @@ def __init__( agent_name=self.agent_name, timeout=self.timeout, connect_timeout=self.connect_timeout, + auth=self._auth, ) if verbose: self.dp_client.logger.setLevel(logging.DEBUG) @@ -72,7 +81,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): self.close() - def invoke(self, payload: Dict[str, Any], timeout: Optional[float] = None) -> Any: + def invoke(self, payload: Dict[str, Any], timeout: Optional[float] = None, path: str = "") -> Any: if not self.session_id: raise ValueError("AgentRuntime session_id is not initialized") @@ -80,6 +89,7 @@ def invoke(self, payload: Dict[str, Any], timeout: Optional[float] = None) -> An session_id=self.session_id, payload=payload, timeout=timeout, + path=path, ) resp.raise_for_status() diff --git a/sdk-python/agentcube/auth.py b/sdk-python/agentcube/auth.py new file mode 100644 index 00000000..bb640bb1 --- /dev/null +++ b/sdk-python/agentcube/auth.py @@ -0,0 +1,94 @@ +# Copyright The Volcano Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time +from typing import Optional, Protocol, Dict, runtime_checkable + +import requests + +from agentcube.utils.log import get_logger + +_REFRESH_BUFFER_SECONDS = 30 +_DEFAULT_TOKEN_TIMEOUT = (5.0, 30.0) # (connect, read) + +logger = get_logger(__name__) + + +@runtime_checkable +class AuthProvider(Protocol): + """Protocol for pluggable authentication providers.""" + + def get_token(self) -> str: ... + + +class TokenAuth: + """Wraps a static bearer token.""" + + def __init__(self, token: str): + if not token: + raise ValueError("Token must not be empty") + self._token = token + + def get_token(self) -> str: + return self._token + + +class ServiceAccountAuth: + """OAuth2 client_credentials grant with thread-safe token caching.""" + + def __init__( + self, + token_url: str, + client_id: str, + client_secret: str, + scope: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + timeout: tuple = _DEFAULT_TOKEN_TIMEOUT, + ): + self._token_url = token_url + self._client_id = client_id + self._client_secret = client_secret + self._scope = scope + self._headers = headers or {} + self._timeout = timeout + + self._lock = threading.Lock() + self._token: Optional[str] = None + self._expires_at: float = 0.0 + + def get_token(self) -> str: + with self._lock: + if self._token and time.monotonic() < self._expires_at: + return self._token + return self._fetch_token() + + def _fetch_token(self) -> str: + data = { + "grant_type": "client_credentials", + "client_id": self._client_id, + "client_secret": self._client_secret, + } + if self._scope: + data["scope"] = self._scope + + logger.debug(f"Fetching token from {self._token_url}") + resp = requests.post(self._token_url, data=data, headers=self._headers, timeout=self._timeout) + resp.raise_for_status() + + body = resp.json() + self._token = body["access_token"] + expires_in = int(body.get("expires_in", 3600)) + self._expires_at = time.monotonic() + expires_in - _REFRESH_BUFFER_SECONDS + return self._token or "" diff --git a/sdk-python/agentcube/clients/agent_runtime_data_plane.py b/sdk-python/agentcube/clients/agent_runtime_data_plane.py index 00eadec3..7156f6e4 100644 --- a/sdk-python/agentcube/clients/agent_runtime_data_plane.py +++ b/sdk-python/agentcube/clients/agent_runtime_data_plane.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, Optional +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, Optional from urllib.parse import urljoin import requests @@ -20,6 +22,9 @@ from agentcube.utils.http import create_session from agentcube.utils.log import get_logger +if TYPE_CHECKING: + from agentcube.auth import AuthProvider + class AgentRuntimeDataPlaneClient: SESSION_HEADER = "x-agentcube-session-id" @@ -33,12 +38,14 @@ def __init__( connect_timeout: float = 5.0, pool_connections: int = 10, pool_maxsize: int = 10, + auth: Optional["AuthProvider"] = None, ): self.router_url = router_url self.namespace = namespace self.agent_name = agent_name self.timeout = timeout self.connect_timeout = connect_timeout + self._auth = auth self.logger = get_logger(f"{__name__}.AgentRuntimeDataPlaneClient") base_path = ( @@ -51,9 +58,16 @@ def __init__( pool_maxsize=pool_maxsize, ) + def _auth_headers(self) -> Dict[str, str]: + """Return Authorization header dict if auth is available.""" + if not self._auth: + return {} + return {"Authorization": f"Bearer {self._auth.get_token()}"} + def bootstrap_session_id(self) -> str: resp = self.session.get( self.base_url, + headers=self._auth_headers(), timeout=(self.connect_timeout, self.timeout), ) session_id = resp.headers.get(self.SESSION_HEADER) @@ -79,16 +93,22 @@ def invoke( session_id: str, payload: Dict[str, Any], timeout: Optional[float] = None, + path: str = "", ) -> requests.Response: headers = { self.SESSION_HEADER: session_id, "Content-Type": "application/json", } + headers.update(self._auth_headers()) read_timeout = timeout if timeout is not None else self.timeout - self.logger.debug(f"POST {self.base_url}") + # Ensure path doesn't have leading slash so urljoin works correctly with base_url + invoke_path = path.lstrip("/") + url = urljoin(self.base_url, invoke_path) if invoke_path else self.base_url + + self.logger.debug(f"POST {url}") return self.session.post( - self.base_url, + url, json=payload, headers=headers, timeout=(self.connect_timeout, read_timeout), diff --git a/sdk-python/agentcube/clients/code_interpreter_data_plane.py b/sdk-python/agentcube/clients/code_interpreter_data_plane.py index 99f51d72..f4c1c0cd 100644 --- a/sdk-python/agentcube/clients/code_interpreter_data_plane.py +++ b/sdk-python/agentcube/clients/code_interpreter_data_plane.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import base64 import json import time import os import ast import shlex -from typing import Optional, Any, Dict, List, Union +from typing import TYPE_CHECKING, Optional, Any, Dict, List, Union from urllib.parse import urljoin import requests @@ -27,6 +29,9 @@ from agentcube.utils.http import create_session from agentcube.exceptions import CommandExecutionError +if TYPE_CHECKING: + from agentcube.auth import AuthProvider + class CodeInterpreterDataPlaneClient: """Client for AgentCube Data Plane (Router -> PicoD). Handles command execution and file operations via the Router. @@ -46,6 +51,7 @@ def __init__( connect_timeout: float = 5.0, pool_connections: int = 10, pool_maxsize: int = 10, + auth: Optional["AuthProvider"] = None, ): """Initialize Data Plane client. @@ -65,6 +71,7 @@ def __init__( self.connect_timeout = connect_timeout self.pool_connections = pool_connections self.pool_maxsize = pool_maxsize + self._auth = auth self.logger = get_logger(f"{__name__}.CodeInterpreterDataPlaneClient") if base_url: @@ -98,6 +105,8 @@ def _request(self, method: str, endpoint: str, body: Optional[bytes] = None, **k url = urljoin(self.base_url, endpoint) headers = {} + if self._auth: + headers["Authorization"] = f"Bearer {self._auth.get_token()}" if body: headers["Content-Type"] = "application/json" @@ -235,6 +244,8 @@ def upload_file(self, local_path: str, remote_path: str) -> None: headers = { "x-agentcube-session-id": self.session_id } + if self._auth: + headers["Authorization"] = f"Bearer {self._auth.get_token()}" self.logger.debug(f"Uploading file {local_path} to {remote_path}") resp = self.session.post(url, files=files, data=data, headers=headers, timeout=self.timeout) diff --git a/sdk-python/agentcube/clients/control_plane.py b/sdk-python/agentcube/clients/control_plane.py index 48bee355..964dffd8 100644 --- a/sdk-python/agentcube/clients/control_plane.py +++ b/sdk-python/agentcube/clients/control_plane.py @@ -12,14 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import os +from typing import TYPE_CHECKING, Dict, Any, Optional + import requests -from typing import Dict, Any, Optional from agentcube.utils.log import get_logger from agentcube.utils.utils import read_token_from_file from agentcube.utils.http import create_session +if TYPE_CHECKING: + from agentcube.auth import AuthProvider + class ControlPlaneClient: """Client for AgentCube Control Plane (WorkloadManager). Handles creation and deletion of Code Interpreter sessions. @@ -33,6 +39,7 @@ def __init__( connect_timeout: float = 5.0, pool_connections: int = 10, pool_maxsize: int = 10, + auth: Optional["AuthProvider"] = None, ): """Initialize the Control Plane client. @@ -43,6 +50,7 @@ def __init__( connect_timeout: Connection timeout in seconds (default: 5). pool_connections: Number of connection pools to cache (default: 10). pool_maxsize: Maximum connections per pool (default: 10). + auth: Optional AuthProvider instance (takes priority over auth_token). """ # Prioritize argument -> env var self.base_url = workload_manager_url or os.getenv("WORKLOAD_MANAGER_URL") @@ -52,9 +60,21 @@ def __init__( "or 'WORKLOAD_MANAGER_URL' environment variable." ) - # Prioritize argument -> k8s service account token file - token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" - token = auth_token or read_token_from_file(token_path) + # Resolve auth: auth param > auth_token > k8s SA token file + if auth: + self._auth = auth + elif auth_token: + from agentcube.auth import TokenAuth + self._auth = TokenAuth(auth_token) + else: + token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + token = read_token_from_file(token_path) + if token: + from agentcube.auth import TokenAuth + self._auth = TokenAuth(token) + else: + self._auth = None + self.timeout = timeout self.connect_timeout = connect_timeout @@ -70,8 +90,13 @@ def __init__( self.session.headers.update({ "Content-Type": "application/json", }) - if token: - self.session.headers["Authorization"] = f"Bearer {token}" + + def _apply_auth(self, request_kwargs: dict) -> None: + """Add Authorization header from auth provider if available.""" + if not self._auth: + return + headers = request_kwargs.setdefault("headers", {}) + headers["Authorization"] = f"Bearer {self._auth.get_token()}" def create_session( self, @@ -102,11 +127,9 @@ def create_session( self.logger.debug(f"Creating session at {url} with payload: {payload}") try: - response = self.session.post( - url, - json=payload, - timeout=(self.connect_timeout, self.timeout) - ) + kwargs = {"json": payload, "timeout": (self.connect_timeout, self.timeout)} + self._apply_auth(kwargs) + response = self.session.post(url, **kwargs) response.raise_for_status() data = response.json() @@ -134,10 +157,9 @@ def delete_session(self, session_id: str) -> bool: self.logger.debug(f"Deleting session {session_id} at {url}") try: - response = self.session.delete( - url, - timeout=(self.connect_timeout, self.timeout) - ) + kwargs: Dict[str, Any] = {"timeout": (self.connect_timeout, self.timeout)} + self._apply_auth(kwargs) + response = self.session.delete(url, **kwargs) if response.status_code == 404: return True # Already gone response.raise_for_status() diff --git a/sdk-python/agentcube/code_interpreter.py b/sdk-python/agentcube/code_interpreter.py index b0ccf5df..ca560627 100644 --- a/sdk-python/agentcube/code_interpreter.py +++ b/sdk-python/agentcube/code_interpreter.py @@ -12,14 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import os import logging -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional from agentcube.clients.control_plane import ControlPlaneClient from agentcube.clients.code_interpreter_data_plane import CodeInterpreterDataPlaneClient from agentcube.utils.log import get_logger +if TYPE_CHECKING: + from agentcube.auth import AuthProvider + class CodeInterpreterClient: """ @@ -58,6 +63,7 @@ def __init__( auth_token: Optional[str] = None, verbose: bool = False, session_id: Optional[str] = None, + auth: Optional["AuthProvider"] = None, ): """ Initialize the Code Interpreter Client. @@ -84,8 +90,17 @@ def __init__( level = logging.DEBUG if verbose else logging.INFO self.logger = get_logger(__name__, level=level) + # Resolve auth: auth param > auth_token > None + if auth: + self._auth = auth + elif auth_token: + from agentcube.auth import TokenAuth + self._auth = TokenAuth(auth_token) + else: + self._auth = None + # Initialize Control Plane client - self.cp_client = ControlPlaneClient(workload_manager_url, auth_token) + self.cp_client = ControlPlaneClient(workload_manager_url, auth=self._auth) if verbose: self.cp_client.logger.setLevel(logging.DEBUG) @@ -133,6 +148,7 @@ def _init_data_plane(self): router_url=self.router_url, namespace=self.namespace, session_id=self.session_id, + auth=self._auth, ) if self.verbose: self.dp_client.logger.setLevel(logging.DEBUG) diff --git a/sdk-python/tests/test_auth.py b/sdk-python/tests/test_auth.py new file mode 100644 index 00000000..23a9f42a --- /dev/null +++ b/sdk-python/tests/test_auth.py @@ -0,0 +1,144 @@ +# Copyright The Volcano Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from concurrent.futures import ThreadPoolExecutor +from unittest.mock import Mock, patch + +from agentcube.auth import AuthProvider, ServiceAccountAuth, TokenAuth + + +class TestTokenAuth(unittest.TestCase): + + def test_token_auth_get_token(self): + auth = TokenAuth("my-secret-token") + self.assertEqual(auth.get_token(), "my-secret-token") + + def test_token_auth_empty_raises(self): + with self.assertRaises(ValueError): + TokenAuth("") + + +class TestServiceAccountAuth(unittest.TestCase): + + def _mock_response(self, access_token="tok-123", expires_in=3600, status_code=200): + resp = Mock() + resp.status_code = status_code + resp.json.return_value = { + "access_token": access_token, + "expires_in": expires_in, + } + resp.raise_for_status = Mock() + return resp + + @patch("agentcube.auth.requests.post") + def test_service_account_auth_initial_fetch(self, mock_post): + mock_post.return_value = self._mock_response() + + auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + token = auth.get_token() + + self.assertEqual(token, "tok-123") + mock_post.assert_called_once() + + @patch("agentcube.auth.requests.post") + def test_service_account_auth_caches_token(self, mock_post): + mock_post.return_value = self._mock_response() + + auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + auth.get_token() + auth.get_token() + + # Only one HTTP call despite two get_token() calls + mock_post.assert_called_once() + + @patch("agentcube.auth.time.monotonic") + @patch("agentcube.auth.requests.post") + def test_service_account_auth_refreshes_expired(self, mock_post, mock_monotonic): + mock_post.return_value = self._mock_response(expires_in=60) + # First call at t=0, cached until t=30 (60 - 30s buffer) + mock_monotonic.return_value = 0.0 + + auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + auth.get_token() + self.assertEqual(mock_post.call_count, 1) + + # Advance past expiry (beyond the 30s buffer) + mock_monotonic.return_value = 31.0 + mock_post.return_value = self._mock_response(access_token="tok-456") + + token = auth.get_token() + self.assertEqual(token, "tok-456") + self.assertEqual(mock_post.call_count, 2) + + @patch("agentcube.auth.requests.post") + def test_service_account_auth_thread_safety(self, mock_post): + mock_post.return_value = self._mock_response() + + auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + + with ThreadPoolExecutor(max_workers=8) as pool: + results = list(pool.map(lambda _: auth.get_token(), range(50))) + + # All threads should get the same cached token + self.assertTrue(all(t == "tok-123" for t in results)) + + @patch("agentcube.auth.requests.post") + def test_service_account_auth_http_error(self, mock_post): + from requests.exceptions import HTTPError + resp = Mock() + resp.status_code = 401 + resp.raise_for_status.side_effect = HTTPError(response=resp) + mock_post.return_value = resp + + auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + with self.assertRaises(HTTPError): + auth.get_token() + + +class TestAuthProviderProtocol(unittest.TestCase): + + def test_auth_provider_protocol(self): + token_auth = TokenAuth("abc") + sa_auth = ServiceAccountAuth( + token_url="https://idp.example.com/token", + client_id="cid", + client_secret="csecret", + ) + self.assertIsInstance(token_auth, AuthProvider) + self.assertIsInstance(sa_auth, AuthProvider) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index daba8c09..3fb24a0b 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -1560,3 +1560,90 @@ func TestCodeInterpreterBasicInvocationLoad(t *testing.T) { runCodeInterpreterLoadTest(t, env, namespace, name, requestsPerSecond, testDuration, "Load test request %d!") } + +// skipIfNoOIDC skips the test when OIDC is not enabled in the E2E environment. +func skipIfNoOIDC(t *testing.T) { + t.Helper() + if os.Getenv("OIDC_ENABLED") != "true" { + t.Skip("skipping OIDC test: OIDC_ENABLED not set (Keycloak not deployed)") + } +} + +// rawInvoke sends an HTTP request to the Router and returns the raw status code and body. +func rawInvoke(routerURL, namespace, name, sessionID, token string) (int, string, error) { + reqURL := fmt.Sprintf("%s/v1/namespaces/%s/agent-runtimes/%s/invocations/echo", + routerURL, namespace, name) + + body := `{"input":"oidc-test"}` + httpReq, err := http.NewRequest("POST", reqURL, strings.NewReader(body)) + if err != nil { + return 0, "", err + } + httpReq.Header.Set("Content-Type", "application/json") + if token != "" { + httpReq.Header.Set("Authorization", "Bearer "+token) + } + if sessionID != "" { + httpReq.Header.Set("x-agentcube-session-id", sessionID) + } + + client := &http.Client{Timeout: 60 * time.Second} + resp, err := client.Do(httpReq) + if err != nil { + return 0, "", err + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return resp.StatusCode, "", err + } + return resp.StatusCode, string(respBody), nil +} + +func TestOIDCAuthNoToken(t *testing.T) { + skipIfNoOIDC(t) + env := newTestEnv(t) + + status, _, err := rawInvoke(env.routerURL, agentcubeNamespace, "echo-agent", "", "") + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, status) +} + +func TestOIDCAuthInvalidToken(t *testing.T) { + skipIfNoOIDC(t) + env := newTestEnv(t) + + status, _, err := rawInvoke(env.routerURL, agentcubeNamespace, "echo-agent", "", "invalid.garbage.token") + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, status) +} + +func TestRLACOwnershipEnforcement(t *testing.T) { + skipIfNoOIDC(t) + env := newTestEnv(t) + + // Step 1: Create a sandbox with the valid OIDC token (User A) + req := &AgentInvokeRequest{Input: "RLAC ownership test"} + _, sessionID, err := env.invokeAgentRuntime(agentcubeNamespace, "echo-agent", "", req) + require.NoError(t, err) + require.NotEmpty(t, sessionID) + + // Step 2: Try to access the same sandbox with a different identity (admin client) + adminToken := os.Getenv("ADMIN_TOKEN") + if adminToken == "" { + t.Skip("skipping RLAC cross-user test: ADMIN_TOKEN not set") + } + + status, body, err := rawInvoke(env.routerURL, agentcubeNamespace, "echo-agent", sessionID, adminToken) + require.NoError(t, err) + + // Admin role should bypass RLAC; if the admin client has admin role, expect 200. + // If the admin client does NOT have admin role, expect 403. + if status == http.StatusForbidden { + require.Contains(t, body, "you do not own this sandbox") + } else { + // Admin bypass succeeded, which is also valid + require.Equal(t, http.StatusOK, status) + } +} diff --git a/test/e2e/run_e2e.sh b/test/e2e/run_e2e.sh index 0c215718..3298f476 100755 --- a/test/e2e/run_e2e.sh +++ b/test/e2e/run_e2e.sh @@ -22,6 +22,8 @@ AGENTCUBE_NAMESPACE=${AGENTCUBE_NAMESPACE:-agentcube-system} WORKLOAD_NAMESPACE=${WORKLOAD_NAMESPACE:-agentcube} E2E_VENV_DIR=${E2E_VENV_DIR:-/tmp/agentcube-e2e-venv} MCP_K8S_LOCAL_PORT=${MCP_K8S_LOCAL_PORT:-19446} +KEYCLOAK_ENABLED=${KEYCLOAK_ENABLED:-false} +KEYCLOAK_IMAGE=${KEYCLOAK_IMAGE:-quay.io/keycloak/keycloak:26.0} _SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" REPO_ROOT="$(cd "$_SCRIPT_DIR/../.." && pwd)" @@ -58,6 +60,10 @@ cleanup() { echo "Stopping MCP in-cluster port forward (PID: $MCP_K8S_PF_PID)..." kill "$MCP_K8S_PF_PID" 2>/dev/null || true fi + if [ -n "${KEYCLOAK_PID:-}" ]; then + echo "Stopping Keycloak port forward (PID: $KEYCLOAK_PID)..." + kill "$KEYCLOAK_PID" 2>/dev/null || true + fi # Best-effort: remove MCP Deployment so the next run starts clean kubectl delete deployment agentcube-code-interpreter-mcp -n "${AGENTCUBE_NAMESPACE:-agentcube}" --ignore-not-found=true 2>/dev/null || true @@ -410,6 +416,41 @@ run_setup() { kubectl get agentruntime echo-agent-short-ttl -n "${WORKLOAD_NAMESPACE}" -o jsonpath='{.metadata.name}{"\n"}' || echo "echo-agent-short-ttl may still be starting..." echo "AgentRuntimes created, waiting for pods to be ready..." sleep 10 + + # Deploy Keycloak when enabled + if [ "${KEYCLOAK_ENABLED}" = "true" ]; then + step "Deploying Keycloak addon..." + docker_pull_if_missing "${KEYCLOAK_IMAGE}" + kind_load_image "${KEYCLOAK_IMAGE}" + + helm upgrade --install keycloak manifests/charts/addons/keycloak \ + --namespace "${AGENTCUBE_NAMESPACE}" \ + --set adminUser=admin --set adminPassword=admin \ + --set clients.service.secret=e2e-service-secret \ + --set clients.router.secret=e2e-router-secret \ + --set clients.admin.secret=e2e-admin-secret \ + --wait --timeout=5m + + step "Waiting for Keycloak to be ready..." + kubectl -n "${AGENTCUBE_NAMESPACE}" rollout status deployment/keycloak --timeout=300s + + # Configure OIDC Helm args for Router + OIDC_HELM_ARGS=( + --set "router.oidc.issuerUrl=http://keycloak.${AGENTCUBE_NAMESPACE}.svc.cluster.local:8080/realms/agentcube" + --set "router.oidc.rolesClaim=realm_access.roles" + --set "router.oidc.requiredRole=sandbox:invoke" + ) + + # Reconfigure Router with OIDC flags + step "Reconfiguring Router with OIDC flags..." + helm upgrade agentcube manifests/charts/base \ + --namespace "${AGENTCUBE_NAMESPACE}" \ + --reuse-values \ + "${OIDC_HELM_ARGS[@]}" \ + --wait --timeout=5m + + kubectl -n "${AGENTCUBE_NAMESPACE}" rollout status deployment/agentcube-router --timeout=300s + fi } echo "Starting E2E tests..." @@ -441,6 +482,35 @@ step "Running tests..." API_TOKEN=$(kubectl create token e2e-test -n "${AGENTCUBE_NAMESPACE}" --duration=24h) echo "Token created" +# Obtain Keycloak tokens when OIDC is enabled +if [ "${KEYCLOAK_ENABLED}" = "true" ]; then + step "Obtaining Keycloak access tokens..." + kubectl port-forward svc/keycloak -n "${AGENTCUBE_NAMESPACE}" 8082:8080 > /tmp/keycloak_port_forward.log 2>&1 & + KEYCLOAK_PID=$! + sleep 3 + + KEYCLOAK_TOKEN=$(curl -s -X POST \ + -H "Host: keycloak.${AGENTCUBE_NAMESPACE}.svc.cluster.local:8080" \ + "http://localhost:8082/realms/agentcube/protocol/openid-connect/token" \ + -d "grant_type=client_credentials" \ + -d "client_id=agentcube-service" \ + -d "client_secret=e2e-service-secret" | jq -r '.access_token') + + ADMIN_TOKEN=$(curl -s -X POST \ + -H "Host: keycloak.${AGENTCUBE_NAMESPACE}.svc.cluster.local:8080" \ + "http://localhost:8082/realms/agentcube/protocol/openid-connect/token" \ + -d "grant_type=client_credentials" \ + -d "client_id=agentcube-admin" \ + -d "client_secret=e2e-admin-secret" | jq -r '.access_token') + + # Override the K8s SA token with the Keycloak token + export API_TOKEN="${KEYCLOAK_TOKEN}" + export ADMIN_TOKEN="${ADMIN_TOKEN}" + export OIDC_ENABLED="true" + export KEYCLOAK_TOKEN_URL="http://localhost:8082/realms/agentcube/protocol/openid-connect/token" + echo "Keycloak tokens acquired" +fi + # Port forward workload manager in background echo "Starting workload manager port-forward..." kubectl port-forward svc/workloadmanager -n "${AGENTCUBE_NAMESPACE}" "${WORKLOAD_MANAGER_LOCAL_PORT}:8080" > /tmp/workload_port_forward.log 2>&1 & @@ -519,6 +589,8 @@ if ! WORKLOAD_MANAGER_URL="http://localhost:${WORKLOAD_MANAGER_LOCAL_PORT}" \ ROUTER_URL="http://localhost:${ROUTER_LOCAL_PORT}" \ MTLS_ENABLED="${MTLS_ENABLED}" \ WORKLOAD_NAMESPACE="${WORKLOAD_NAMESPACE}" \ + OIDC_ENABLED="${OIDC_ENABLED:-false}" \ + ADMIN_TOKEN="${ADMIN_TOKEN:-}" \ API_TOKEN=$API_TOKEN \ go test -v ./test/e2e/...; then TEST_FAILED=1 @@ -530,12 +602,28 @@ cd "$(dirname "$0")" if ! WORKLOAD_MANAGER_URL="http://localhost:${WORKLOAD_MANAGER_LOCAL_PORT}" \ ROUTER_URL="http://localhost:${ROUTER_LOCAL_PORT}" \ MTLS_ENABLED="${MTLS_ENABLED}" \ + OIDC_ENABLED="${OIDC_ENABLED:-false}" \ + KEYCLOAK_TOKEN_URL="${KEYCLOAK_TOKEN_URL:-}" \ API_TOKEN=$API_TOKEN \ AGENTCUBE_NAMESPACE="${WORKLOAD_NAMESPACE}" \ "$E2E_VENV_DIR/bin/python" test_codeinterpreter.py; then TEST_FAILED=1 fi +if [ "${KEYCLOAK_ENABLED}" = "true" ]; then + echo "Running Python OIDC auth tests..." + if ! WORKLOAD_MANAGER_URL="http://localhost:${WORKLOAD_MANAGER_LOCAL_PORT}" \ + ROUTER_URL="http://localhost:${ROUTER_LOCAL_PORT}" \ + OIDC_ENABLED="true" \ + KEYCLOAK_TOKEN_URL="${KEYCLOAK_TOKEN_URL}" \ + AGENTCUBE_SYSTEM_NAMESPACE="${AGENTCUBE_NAMESPACE}" \ + API_TOKEN=$API_TOKEN \ + AGENTCUBE_NAMESPACE="${WORKLOAD_NAMESPACE}" \ + "$E2E_VENV_DIR/bin/python" test_oidc_auth.py; then + TEST_FAILED=1 + fi +fi + echo "Running LangChain AgentcubeSandbox E2E..." if ! WORKLOAD_MANAGER_URL="http://localhost:${WORKLOAD_MANAGER_LOCAL_PORT}" ROUTER_URL="http://localhost:${ROUTER_LOCAL_PORT}" MTLS_ENABLED="${MTLS_ENABLED}" API_TOKEN=$API_TOKEN AGENTCUBE_NAMESPACE="${AGENTCUBE_NAMESPACE}" "$E2E_VENV_DIR/bin/python" test_langchain_agentcube_sandbox.py; then TEST_FAILED=1 diff --git a/test/e2e/test_oidc_auth.py b/test/e2e/test_oidc_auth.py new file mode 100644 index 00000000..e33e5640 --- /dev/null +++ b/test/e2e/test_oidc_auth.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright The Volcano Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""E2E tests for OIDC authentication using the Python SDK.""" + +import os +import sys +import unittest + + +def skip_if_no_oidc(): + """Skip test if OIDC is not enabled.""" + if os.getenv("OIDC_ENABLED") != "true": + raise unittest.SkipTest("OIDC_ENABLED not set (Keycloak not deployed)") + + +class TestOIDCSDKAuth(unittest.TestCase): + """Tests that the Python SDK works with ServiceAccountAuth.""" + + def test_service_account_auth_agent_runtime(self): + """Full flow: ServiceAccountAuth -> AgentRuntime session -> execute code.""" + skip_if_no_oidc() + + from agentcube import AgentRuntimeClient, ServiceAccountAuth + + keycloak_url = os.getenv( + "KEYCLOAK_TOKEN_URL", + "http://localhost:8082/realms/agentcube/protocol/openid-connect/token", + ) + + system_namespace = os.getenv("AGENTCUBE_SYSTEM_NAMESPACE", "agentcube-system") + auth = ServiceAccountAuth( + token_url=keycloak_url, + client_id="agentcube-service", + client_secret="e2e-service-secret", + headers={"Host": f"keycloak.{system_namespace}.svc.cluster.local:8080"} + ) + + namespace = os.getenv("AGENTCUBE_NAMESPACE", "agentcube") + router_url = os.getenv("ROUTER_URL", "http://localhost:8081") + + with AgentRuntimeClient( + agent_name="echo-agent", + namespace=namespace, + router_url=router_url, + auth=auth, + ) as client: + result = client.invoke({"input": "Hello OIDC"}, path="echo") + output = result.get("output", "") if isinstance(result, dict) else str(result) + self.assertIn("Hello OIDC", output) + + +if __name__ == "__main__": + result = unittest.main(verbosity=2, exit=False) + sys.exit(0 if result.result.wasSuccessful() else 1)