From ad6b74d4e9b47d817cf1f142d6e9565901abf764 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 09:59:00 -0700 Subject: [PATCH 1/9] Add TLS support --- deployments/charts/service/README.md | 21 +- .../templates/_gateway-envoy-config.tpl | 42 ++-- .../service/templates/_gateway-helpers.tpl | 56 +++++ .../service/templates/agent-service.yaml | 8 +- .../charts/service/templates/api-service.yaml | 13 +- .../charts/service/templates/gateway-tls.yaml | 196 ++++++++++++++++++ .../service/templates/logger-service.yaml | 8 +- .../service/templates/router-service.yaml | 11 +- deployments/charts/service/values.yaml | 53 ++++- src/service/agent/agent_service.py | 6 +- src/service/core/service.py | 2 +- src/service/core/workflow/objects.py | 1 + src/service/logger/logger.py | 6 +- src/service/router/router.py | 5 +- src/utils/static_config.py | 25 +++ 15 files changed, 417 insertions(+), 36 deletions(-) create mode 100644 deployments/charts/service/templates/gateway-tls.yaml diff --git a/deployments/charts/service/README.md b/deployments/charts/service/README.md index 6fa51f6fa..abbb62c3b 100644 --- a/deployments/charts/service/README.md +++ b/deployments/charts/service/README.md @@ -403,11 +403,28 @@ Envoy uses filesystem-based dynamic configuration (LDS/CDS). When the ConfigMap | `gateway.networkPolicies.enabled` | Deploy NetworkPolicies restricting ingress to upstream pods | `false` | | `gateway.networkPolicies.upstreams` | List of upstream pods to protect (name, podSelector, port) | See values.yaml | -#### TLS +#### Gateway → Upstream TLS + +When `gateway.tls.enabled` is `true`, traffic between the Envoy gateway and the upstream services (`osmo-service`, `osmo-router`, `osmo-agent`, `osmo-logger`) is encrypted end-to-end. The Envoy verifies upstream certs against a CA, and each upstream serves HTTPS via uvicorn's `--ssl_keyfile` / `--ssl_certfile`. 
The UI intentionally stays on plain HTTP and relies on `gateway.networkPolicies` for ingress restriction (Next.js does not natively serve TLS). + +Cert material can be provisioned in two ways: + +- **A1 (default)** — the chart self-signs a CA (`<gateway-name>-ca-tls`) and per-service leaf certs (`osmo-service-tls`, `osmo-router-tls`, `osmo-agent-tls`, `osmo-logger-tls`) using Sprig's `genCA` / `genSignedCert`. The `lookup` function reuses existing secrets across `helm upgrade` so certs aren't rotated on every release. Zero external dependencies. Default validity is 10 years (CA) and 5 years (leaf). +- **A2 (`gateway.tls.certManager.enabled: true`)** — the chart emits cert-manager `Issuer` + `Certificate` resources. By default it creates a self-signed root + a CA Issuer + per-service Certificates. To plug in an existing CA (Vault, internal PKI, ACME), set `gateway.tls.certManager.issuerRef`. Requires cert-manager installed in the cluster. | Parameter | Description | Default | |-----------|-------------|---------| -| `gateway.tls.enabled` | Generate self-signed certs for upstream TLS | `false` | +| `gateway.tls.enabled` | Encrypt gateway → upstream traffic. Generates certs (mode chosen below), wires upstream Deployments to serve HTTPS, and adds `UpstreamTlsContext` + `sni:` to every Envoy upstream cluster except the UI (which stays on plain HTTP). | `false` | +| `gateway.tls.caDuration` | CA cert validity (cert-manager mode) | `87600h` (10y) | +| `gateway.tls.caRenewBefore` | Renew CA this long before expiry (cert-manager mode) | `720h` (30d) | +| `gateway.tls.certDuration` | Leaf cert validity (cert-manager mode) | `43800h` (5y) | +| `gateway.tls.certRenewBefore` | Renew leaf this long before expiry (cert-manager mode) | `360h` (15d) | +| `gateway.tls.certManager.enabled` | Use cert-manager to manage cert lifecycle (A2). Requires cert-manager. | `false` | +| `gateway.tls.certManager.issuerRef` | Optional: point at an existing Issuer/ClusterIssuer. 
Map with `name`, `kind` (`Issuer` or `ClusterIssuer`), and `group` (defaults to `cert-manager.io`). When empty, the chart creates a self-signed Issuer + CA chain. | `{}` | + +The Helm-mode (A1) Secrets are annotated with `helm.sh/resource-policy: keep` so a `helm uninstall` won't shred the CA — protects against accidental rotation. + +NetworkPolicy and TLS are independent: NetworkPolicy controls *who* can connect at L3/L4; TLS encrypts the bytes at L7. Run them together for defense in depth, or either alone. ### Extensibility diff --git a/deployments/charts/service/templates/_gateway-envoy-config.tpl b/deployments/charts/service/templates/_gateway-envoy-config.tpl index ce7e43e5e..896a44d1a 100644 --- a/deployments/charts/service/templates/_gateway-envoy-config.tpl +++ b/deployments/charts/service/templates/_gateway-envoy-config.tpl @@ -578,6 +578,7 @@ data: name: envoy.transport_sockets.tls typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: {{ $gw.upstreams.service.host }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -611,6 +612,7 @@ data: name: envoy.transport_sockets.tls typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: {{ $gw.upstreams.router.host }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -638,20 +640,12 @@ data: socket_address: address: {{ $gw.upstreams.ui.host }} port_value: {{ $gw.upstreams.ui.port }} - {{- if $gw.tls.enabled }} - transport_socket: - name: envoy.transport_sockets.tls - typed_config: - "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext - common_tls_context: - validation_context_sds_secret_config: - name: upstream_ca - sds_config: - path_config_source: - path: /var/config/sds_upstream_ca.yaml - watched_directory: - path: /var/config - {{- end }} + {{/* + UI traffic stays HTTP — Next.js does not natively serve 
HTTPS and + the UI sits behind NetworkPolicy. Confidentiality of the UI HTML + relies on browser → gateway TLS (gateway.envoy.ssl.enabled), not on + Envoy → upstream TLS. + */}} {{- end }} {{- if $gw.upstreams.agent.enabled }} @@ -675,6 +669,7 @@ data: name: envoy.transport_sockets.tls typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: {{ $gw.upstreams.agent.host }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -707,6 +702,7 @@ data: name: envoy.transport_sockets.tls typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: {{ $gw.upstreams.logger.host }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -805,6 +801,7 @@ data: {{- end }} {{- if $envoy.internalJwks.enabled }} + {{- $jwksHost := $envoy.internalJwks.host | default $gw.upstreams.service.host }} - "@type": type.googleapis.com/envoy.config.cluster.v3.Cluster name: {{ $envoy.internalJwks.cluster }} connect_timeout: 3s @@ -818,8 +815,23 @@ data: - endpoint: address: socket_address: - address: {{ $envoy.internalJwks.host | default $gw.upstreams.service.host }} + address: {{ $jwksHost }} port_value: {{ $envoy.internalJwks.port | default $gw.upstreams.service.port }} + {{- if $gw.tls.enabled }} + transport_socket: + name: envoy.transport_sockets.tls + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + sni: {{ $jwksHost }} + common_tls_context: + validation_context_sds_secret_config: + name: upstream_ca + sds_config: + path_config_source: + path: /var/config/sds_upstream_ca.yaml + watched_directory: + path: /var/config + {{- end }} {{- end }} {{- end }} diff --git a/deployments/charts/service/templates/_gateway-helpers.tpl b/deployments/charts/service/templates/_gateway-helpers.tpl index e47765cc7..a64295e36 100644 --- a/deployments/charts/service/templates/_gateway-helpers.tpl +++ 
b/deployments/charts/service/templates/_gateway-helpers.tpl @@ -30,3 +30,59 @@ app.kubernetes.io/name: {{ include "osmo.gateway-name" .context }} app.kubernetes.io/instance: {{ .context.Release.Name }} app.kubernetes.io/component: {{ .component }} {{- end }} + +{{/* +Per-upstream TLS args. Used by api-service, agent-service, logger-service, +router-service when gateway.tls.enabled is true. Pass a dict with "context". + +Outputs --ssl_keyfile and --ssl_certfile, indented to fit container args lists. +The mount path is fixed at /etc/osmo/tls; the cert is provided by the +{name}-tls Secret created by gateway-tls.yaml. +*/}} +{{- define "osmo.upstream-tls-args" -}} +{{- if .context.Values.gateway.tls.enabled }} +- --ssl_keyfile +- /etc/osmo/tls/tls.key +- --ssl_certfile +- /etc/osmo/tls/tls.crt +{{- end }} +{{- end }} + +{{/* +TLS volume mount for an upstream container. Use under volumeMounts. +*/}} +{{- define "osmo.upstream-tls-volume-mount" -}} +{{- if .context.Values.gateway.tls.enabled }} +- name: tls + mountPath: /etc/osmo/tls + readOnly: true +{{- end }} +{{- end }} + +{{/* +TLS volume for an upstream pod. Pass dict with "context" and "secret" (the +per-service Secret name, e.g. "osmo-service-tls"). Use under volumes. +*/}} +{{- define "osmo.upstream-tls-volume" -}} +{{- if .context.Values.gateway.tls.enabled }} +- name: tls + secret: + secretName: {{ .secret }} +{{- end }} +{{- end }} + +{{/* +Render a probe block, injecting `scheme: HTTPS` into httpGet when TLS is on. +Pass dict with "probe" (the probe value from Values) and "context" ($). + +Use: + livenessProbe: + {{- include "osmo.upstream-probe-yaml" (dict "probe" .Values.services.service.livenessProbe "context" .) 
| nindent 10 }} +*/}} +{{- define "osmo.upstream-probe-yaml" -}} +{{- $probe := .probe }} +{{- if and $probe .context.Values.gateway.tls.enabled (hasKey $probe "httpGet") }} + {{- $probe = mustMergeOverwrite (deepCopy $probe) (dict "httpGet" (dict "scheme" "HTTPS")) }} +{{- end }} +{{- toYaml $probe }} +{{- end }} diff --git a/deployments/charts/service/templates/agent-service.yaml b/deployments/charts/service/templates/agent-service.yaml index b76784f82..080a5f727 100644 --- a/deployments/charts/service/templates/agent-service.yaml +++ b/deployments/charts/service/templates/agent-service.yaml @@ -130,6 +130,7 @@ spec: {{- range $arg := .Values.services.agent.extraArgs }} - {{ $arg | quote }} {{- end }} + {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} env: {{- if .Values.services.migration.enabled }} - name: OSMO_SCHEMA_VERSION @@ -154,7 +155,7 @@ spec: {{- end }} imagePullPolicy: {{ .Values.services.agent.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts .Values.gateway.tls.enabled }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -168,6 +169,7 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.agent | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) 
| nindent 8 }} resources: {{- toYaml .Values.services.agent.resources | nindent 10 }} @@ -202,6 +204,9 @@ spec: httpGet: port: 8000 path: /health + {{- if .Values.gateway.tls.enabled }} + scheme: HTTPS + {{- end }} periodSeconds: 45 failureThreshold: 3 timeoutSeconds: 20 @@ -210,6 +215,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.agent | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.agent | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-agent-tls") | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/api-service.yaml b/deployments/charts/service/templates/api-service.yaml index 67a75f145..a7209be0b 100644 --- a/deployments/charts/service/templates/api-service.yaml +++ b/deployments/charts/service/templates/api-service.yaml @@ -152,6 +152,7 @@ spec: {{- range $arg := .Values.services.service.extraArgs }} - {{ $arg | quote }} {{- end }} + {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} env: - name: OSMO_DISABLE_TASK_METRICS value: {{ .Values.services.service.disableTaskMetrics | quote }} @@ -193,7 +194,7 @@ spec: ports: - name: metrics containerPort: 9464 - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts .Values.gateway.tls.enabled }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -207,12 +208,13 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.service | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) 
| nindent 8 }} resources: {{- toYaml .Values.services.service.resources | nindent 10 }} # Any failure to return the version api means the service is in a bad state livenessProbe: - {{- toYaml .Values.services.service.livenessProbe | nindent 10 }} + {{- include "osmo.upstream-probe-yaml" (dict "probe" .Values.services.service.livenessProbe "context" .) | nindent 10 }} # Give the container 30 seconds to startup @@ -220,6 +222,9 @@ spec: httpGet: port: 8000 path: /api/version + {{- if .Values.gateway.tls.enabled }} + scheme: HTTPS + {{- end }} failureThreshold: 6 periodSeconds: 5 timeoutSeconds: 3 @@ -230,6 +235,9 @@ spec: httpGet: port: 8000 path: /api/workflow?limit=0&all_pools=true + {{- if .Values.gateway.tls.enabled }} + scheme: HTTPS + {{- end }} httpHeaders: - name: x-osmo-roles value: osmo-admin @@ -240,6 +248,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.service | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.service | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-service-tls") | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/gateway-tls.yaml b/deployments/charts/service/templates/gateway-tls.yaml new file mode 100644 index 000000000..0a87a3541 --- /dev/null +++ b/deployments/charts/service/templates/gateway-tls.yaml @@ -0,0 +1,196 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +{{/* +Gateway -> upstream TLS materials. + +Two modes, controlled by gateway.tls.certManager.enabled: + + A1 (default, certManager.enabled=false): Helm self-signs a CA and per-service + server certs. We use `lookup` so existing certs are reused across + `helm upgrade` — otherwise every release would rotate certs and break live + connections. The CA validity is 10 years; leaf certs are 5 years. Long + enough that rotation is a deliberate operation, short enough that the + private keys aren't immortal. + + A2 (certManager.enabled=true): emit cert-manager Issuer + Certificate + resources. cert-manager owns rotation. The Issuer is self-signed by + default; users can point at an existing ClusterIssuer (e.g. Vault, an + internal PKI) via gateway.tls.certManager.issuerRef. +*/}} + +{{- if .Values.gateway.tls.enabled }} +{{- $gw := .Values.gateway }} +{{- $gwName := include "osmo.gateway-name" . }} +{{- $ns := .Release.Namespace }} + +{{/* + Upstream service definitions. Each entry produces one server-cert Secret with + SANs covering the in-cluster DNS names Envoy uses to reach the service. 
+*/}} +{{- $upstreams := list + (dict "name" "osmo-service" "secret" "osmo-service-tls" + "dnsNames" (list "osmo-service" (printf "osmo-service.%s" $ns) (printf "osmo-service.%s.svc" $ns) (printf "osmo-service.%s.svc.cluster.local" $ns))) + (dict "name" "osmo-router" "secret" "osmo-router-tls" + "dnsNames" (list "osmo-router" "osmo-router-headless" + (printf "osmo-router.%s" $ns) (printf "osmo-router.%s.svc" $ns) (printf "osmo-router.%s.svc.cluster.local" $ns) + (printf "osmo-router-headless.%s" $ns) (printf "osmo-router-headless.%s.svc" $ns) (printf "osmo-router-headless.%s.svc.cluster.local" $ns))) + (dict "name" "osmo-agent" "secret" "osmo-agent-tls" + "dnsNames" (list "osmo-agent" (printf "osmo-agent.%s" $ns) (printf "osmo-agent.%s.svc" $ns) (printf "osmo-agent.%s.svc.cluster.local" $ns))) + (dict "name" "osmo-logger" "secret" "osmo-logger-tls" + "dnsNames" (list "osmo-logger" (printf "osmo-logger.%s" $ns) (printf "osmo-logger.%s.svc" $ns) (printf "osmo-logger.%s.svc.cluster.local" $ns))) +}} + +{{- if $gw.tls.certManager.enabled }} + +{{/* ===================================================================== */}} +{{/* A2 — cert-manager */}} +{{/* ===================================================================== */}} + +{{- if not $gw.tls.certManager.issuerRef }} +{{/* No external issuer — create a self-signed Issuer + a CA Certificate */}} +{{/* and a CA Issuer that signs the per-service certs. 
*/}} +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ $gwName }}-selfsigned +spec: + selfSigned: {} + +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ $gwName }}-ca +spec: + isCA: true + commonName: {{ $gwName }}-ca + secretName: {{ $gwName }}-ca-tls + duration: {{ $gw.tls.caDuration | default "87600h" }} + renewBefore: {{ $gw.tls.caRenewBefore | default "720h" }} + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: {{ $gwName }}-selfsigned + kind: Issuer + group: cert-manager.io + +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ $gwName }}-ca +spec: + ca: + secretName: {{ $gwName }}-ca-tls +{{- end }} + +{{- $issuerName := $gw.tls.certManager.issuerRef.name | default (printf "%s-ca" $gwName) }} +{{- $issuerKind := $gw.tls.certManager.issuerRef.kind | default "Issuer" }} +{{- $issuerGroup := $gw.tls.certManager.issuerRef.group | default "cert-manager.io" }} + +{{- range $u := $upstreams }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ $u.name }} +spec: + secretName: {{ $u.secret }} + duration: {{ $gw.tls.certDuration | default "43800h" }} + renewBefore: {{ $gw.tls.certRenewBefore | default "360h" }} + commonName: {{ $u.name }} + dnsNames: + {{- range $u.dnsNames }} + - {{ . }} + {{- end }} + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: {{ $issuerName }} + kind: {{ $issuerKind }} + group: {{ $issuerGroup }} +{{- end }} + +{{- else }} + +{{/* ===================================================================== */}} +{{/* A1 — Helm self-signed */}} +{{/* ===================================================================== */}} + +{{/* + Look up an existing CA secret. If found, reuse it so `helm upgrade` doesn't + rotate the CA on every release. If not, generate one. Use buildCustomCert + to reconstruct the sprig.certificate struct that genSignedCert requires. 
+*/}} +{{- $caSecret := lookup "v1" "Secret" $ns (printf "%s-ca-tls" $gwName) }} +{{- $caCertB64 := "" }} +{{- $caKeyB64 := "" }} +{{- if and $caSecret $caSecret.data (index $caSecret.data "ca.crt") (index $caSecret.data "ca.key") }} + {{- $caCertB64 = index $caSecret.data "ca.crt" }} + {{- $caKeyB64 = index $caSecret.data "ca.key" }} +{{- else }} + {{- $generated := genCA (printf "%s-ca" $gwName) 3650 }} + {{- $caCertB64 = $generated.Cert | b64enc }} + {{- $caKeyB64 = $generated.Key | b64enc }} +{{- end }} +{{- $ca := buildCustomCert $caCertB64 $caKeyB64 }} + +apiVersion: v1 +kind: Secret +metadata: + name: {{ $gwName }}-ca-tls + annotations: + "helm.sh/resource-policy": keep +type: Opaque +data: + ca.crt: {{ $caCertB64 }} + ca.key: {{ $caKeyB64 }} + +{{- range $u := $upstreams }} +--- +{{/* + Per-service leaf cert. Reuse on upgrade if a usable secret already exists; + otherwise sign a fresh one with the CA above. 5-year validity. +*/}} +{{- $existing := lookup "v1" "Secret" $ns $u.secret }} +{{- $leafCrtB64 := "" }} +{{- $leafKeyB64 := "" }} +{{- if and $existing $existing.data (index $existing.data "tls.crt") (index $existing.data "tls.key") }} + {{- $leafCrtB64 = index $existing.data "tls.crt" }} + {{- $leafKeyB64 = index $existing.data "tls.key" }} +{{- else }} + {{- $cert := genSignedCert $u.name nil $u.dnsNames 1825 $ca }} + {{- $leafCrtB64 = $cert.Cert | b64enc }} + {{- $leafKeyB64 = $cert.Key | b64enc }} +{{- end }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ $u.secret }} + annotations: + "helm.sh/resource-policy": keep +type: kubernetes.io/tls +data: + tls.crt: {{ $leafCrtB64 }} + tls.key: {{ $leafKeyB64 }} + ca.crt: {{ $caCertB64 }} +{{- end }} + +{{- end }}{{/* end A1/A2 branch */}} +{{- end }}{{/* end gateway.tls.enabled */}} diff --git a/deployments/charts/service/templates/logger-service.yaml b/deployments/charts/service/templates/logger-service.yaml index 9644de8b8..4ee27693f 100644 --- 
a/deployments/charts/service/templates/logger-service.yaml +++ b/deployments/charts/service/templates/logger-service.yaml @@ -125,6 +125,7 @@ spec: {{- range $arg := .Values.services.logger.extraArgs }} - {{ $arg | quote }} {{- end }} + {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} env: {{- include "osmo.configmap-env" . | nindent 8 }} {{- include "osmo.extra-env" .Values.services.logger | nindent 8 }} @@ -145,7 +146,7 @@ spec: {{- end }} imagePullPolicy: {{ .Values.services.logger.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts .Values.gateway.tls.enabled }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -159,6 +160,7 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.logger | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} resources: {{- toYaml .Values.services.logger.resources | nindent 10 }} @@ -193,6 +195,9 @@ spec: httpGet: port: 8000 path: /health + {{- if .Values.gateway.tls.enabled }} + scheme: HTTPS + {{- end }} periodSeconds: 45 failureThreshold: 3 timeoutSeconds: 20 @@ -201,6 +206,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.logger | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.logger | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . 
"secret" "osmo-logger-tls") | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/router-service.yaml b/deployments/charts/service/templates/router-service.yaml index ea1aa7536..2bcc7b628 100644 --- a/deployments/charts/service/templates/router-service.yaml +++ b/deployments/charts/service/templates/router-service.yaml @@ -120,6 +120,7 @@ spec: - {{ . }} {{- end }} {{- end }} + {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} env: - name: OSMO_SCHEMA_VERSION value: {{ .Values.targetSchema | default "public" }} @@ -162,7 +163,7 @@ spec: protocol: {{ .protocol | default "TCP" }} {{- end }} {{- end }} - {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts }} + {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts .Values.gateway.tls.enabled }} volumeMounts: {{- if .Values.services.configFile.enabled}} - mountPath: {{ .Values.services.configFile.path }} @@ -176,23 +177,24 @@ spec: {{- with .Values.services.router.extraVolumeMounts }} {{- toYaml . | nindent 8 }} {{- end }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} {{- end }} resources: {{- toYaml .Values.services.router.resources | nindent 10 }} {{- with .Values.services.router.livenessProbe }} livenessProbe: - {{- toYaml . | nindent 10 }} + {{- include "osmo.upstream-probe-yaml" (dict "probe" . "context" $) | nindent 10 }} {{- end }} {{- with .Values.services.router.startupProbe }} startupProbe: - {{- toYaml . | nindent 10 }} + {{- include "osmo.upstream-probe-yaml" (dict "probe" . "context" $) | nindent 10 }} {{- end }} {{- with .Values.services.router.readinessProbe }} readinessProbe: - {{- toYaml . | nindent 10 }} + {{- include "osmo.upstream-probe-yaml" (dict "probe" . 
"context" $) | nindent 10 }} {{- end }} {{- with .Values.services.router.extraContainers }} @@ -215,6 +217,7 @@ spec: {{- with .Values.services.router.extraVolumes }} {{- toYaml . | nindent 8 }} {{- end }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-router-tls") | nindent 8 }} --- diff --git a/deployments/charts/service/values.yaml b/deployments/charts/service/values.yaml index 925cc2e16..3104e6318 100644 --- a/deployments/charts/service/values.yaml +++ b/deployments/charts/service/values.yaml @@ -2108,10 +2108,55 @@ gateway: port: 8000 ## ----------------------------------------------------------------------- - ## TLS — self-signed certificates for encrypted gateway-to-upstream - ## communication. When enabled, Helm generates a CA and server cert; - ## Envoy uses the CA to verify upstream connections. The upstream - ## services must also be configured to serve HTTPS. + ## TLS — encrypted gateway-to-upstream communication. + ## + ## When enabled: + ## - Cert material is generated (see two modes below) + ## - Envoy validates upstreams against the CA + ## - Upstream Deployments mount their per-service Secret and start uvicorn + ## with --ssl_keyfile / --ssl_certfile so they serve HTTPS on :8000 + ## - Probes use scheme: HTTPS + ## + ## Two modes for cert material: + ## + ## A1 (default, certManager.enabled=false): + ## Helm self-signs a CA + per-service leaf certs using genCA / + ## genSignedCert with `lookup` so existing certs are reused across + ## `helm upgrade`. Zero external dependencies. + ## + ## A2 (certManager.enabled=true): + ## Emit cert-manager Issuer + Certificate resources. Provide your own + ## issuerRef (e.g. a Vault ClusterIssuer) or omit it to let the chart + ## create a self-signed Issuer + CA chain. Production-grade rotation. + ## + ## The UI is intentionally NOT served over TLS — it stays HTTP behind + ## NetworkPolicy. Only osmo-service, osmo-router, osmo-agent, and + ## osmo-logger get TLS server certs. 
## ----------------------------------------------------------------------- tls: enabled: false + + ## CA cert validity (cert-manager mode). 10 years by default — long + ## enough that rotation is a deliberate operation. + caDuration: 87600h + caRenewBefore: 720h + + ## Leaf-cert validity (cert-manager mode). 5 years by default. + certDuration: 43800h + certRenewBefore: 360h + + certManager: + ## Use cert-manager (A2) instead of Helm-generated certs (A1). + ## Requires cert-manager installed in the cluster. + enabled: false + + ## Optional: point at an existing Issuer/ClusterIssuer instead of + ## letting the chart create a self-signed CA chain. Set name + kind + ## (Issuer or ClusterIssuer) + group. + ## + ## Example: + ## issuerRef: + ## name: vault-issuer + ## kind: ClusterIssuer + ## group: cert-manager.io + issuerRef: {} diff --git a/src/service/agent/agent_service.py b/src/service/agent/agent_service.py index b708e3ed8..a4f0e5465 100644 --- a/src/service/agent/agent_service.py +++ b/src/service/agent/agent_service.py @@ -42,7 +42,8 @@ # mixin BackendServiceConfig.load() would reject the unknown flag and crash. 
class BackendServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, - static_config.StaticConfig, ConfigFileMixin): + static_config.StaticConfig, + static_config.SSLConfig, ConfigFileMixin): """Config settings for the backend service""" progress_period: int = pydantic.Field( default=30, @@ -138,7 +139,8 @@ async def liveness_update(): await asyncio.sleep(agent_service_config.progress_period) async def run_server(): - uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None) + uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None, + **config.uvicorn_ssl_kwargs()) uvicorn_server = uvicorn.Server(config=uvicorn_config) liveness_task = asyncio.create_task(liveness_update()) try: diff --git a/src/service/core/service.py b/src/service/core/service.py index f6b94a14b..a539567bc 100644 --- a/src/service/core/service.py +++ b/src/service/core/service.py @@ -545,7 +545,7 @@ def main(): port = 8000 try: - uvicorn.run(app, host=host, port=port, log_config=None) + uvicorn.run(app, host=host, port=port, log_config=None, **config.uvicorn_ssl_kwargs()) except KeyboardInterrupt: sys.exit(0) diff --git a/src/service/core/workflow/objects.py b/src/service/core/workflow/objects.py index 8a3e0d29d..ab5838040 100644 --- a/src/service/core/workflow/objects.py +++ b/src/service/core/workflow/objects.py @@ -40,6 +40,7 @@ class WorkflowServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, + static_config.SSLConfig, metrics.MetricsCreatorConfig, ConfigFileMixin): """ Manages configuration specific to the workflow service. 
""" diff --git a/src/service/logger/logger.py b/src/service/logger/logger.py index a3637e4cf..cea943da2 100644 --- a/src/service/logger/logger.py +++ b/src/service/logger/logger.py @@ -34,7 +34,8 @@ class LoggerServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, - static_config.StaticConfig, ConfigFileMixin): + static_config.StaticConfig, + static_config.SSLConfig, ConfigFileMixin): """Config settings for the logger service""" host: str = pydantic.Field( default='http://0.0.0.0:8000', @@ -89,7 +90,8 @@ async def liveness_update(): await asyncio.sleep(config.progress_period) async def run_server(): - uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None) + uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None, + **config.uvicorn_ssl_kwargs()) uvicorn_server = uvicorn.Server(config=uvicorn_config) liveness_task = asyncio.create_task(liveness_update()) try: diff --git a/src/service/router/router.py b/src/service/router/router.py index 63c38e7ea..0e46a4991 100644 --- a/src/service/router/router.py +++ b/src/service/router/router.py @@ -36,7 +36,7 @@ class RouterServiceConfig(src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, - connectors.PostgresConfig): + static_config.SSLConfig, connectors.PostgresConfig): """Config settings for the logger service""" host: str = pydantic.Field( default='http://0.0.0.0:8000', @@ -423,7 +423,8 @@ def main(): connectors.PostgresConnector(config) async def run_server(): - uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None) + uvicorn_config = uvicorn.Config(app, host=host, port=port, log_config=None, + **config.uvicorn_ssl_kwargs()) uvicorn_server = uvicorn.Server(config=uvicorn_config) check_timeout_task = asyncio.create_task(check_webserver_timeout()) try: diff --git a/src/utils/static_config.py b/src/utils/static_config.py index fe226efdc..43bee357e 100644 --- a/src/utils/static_config.py +++ 
b/src/utils/static_config.py @@ -35,6 +35,31 @@ def _get_field_extras(field: FieldInfo) -> Dict[str, Any]: return {} +class SSLConfig(pydantic.BaseModel): + """TLS/SSL configuration for the uvicorn listener. + + Mixed into every service config so a single set of --ssl_keyfile / --ssl_certfile + flags works uniformly. When both files are set, uvicorn serves HTTPS; otherwise + HTTP. Used by the gateway -> upstream TLS path (gateway.tls.enabled in the chart). + """ + ssl_keyfile: Optional[str] = pydantic.Field( + default=None, + description='Path to a PEM-encoded private key. If set together with ' + 'ssl_certfile, the service serves HTTPS instead of HTTP.', + json_schema_extra={'command_line': 'ssl_keyfile', 'env': 'OSMO_SSL_KEYFILE'}) + ssl_certfile: Optional[str] = pydantic.Field( + default=None, + description='Path to a PEM-encoded certificate (server leaf, optionally ' + 'chained). Required together with ssl_keyfile.', + json_schema_extra={'command_line': 'ssl_certfile', 'env': 'OSMO_SSL_CERTFILE'}) + + def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: + """Return uvicorn keyword args for TLS, or an empty dict if TLS is off.""" + if self.ssl_keyfile and self.ssl_certfile: + return {'ssl_keyfile': self.ssl_keyfile, 'ssl_certfile': self.ssl_certfile} + return {} + + class StaticConfig(pydantic.BaseModel): """ A class for reading in config information from either command line, files, or environment variables """ From f7d71c63113e34525bb84d3dc0018d55c53ecaf5 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 13:53:59 -0700 Subject: [PATCH 2/9] Update envoy --- deployments/charts/service/README.md | 27 ++-- .../templates/_gateway-envoy-config.tpl | 22 +++- .../service/templates/_gateway-helpers.tpl | 27 ++-- .../service/templates/agent-service.yaml | 2 +- .../charts/service/templates/api-service.yaml | 2 +- .../charts/service/templates/gateway-tls.yaml | 116 ++++-------------- .../charts/service/templates/gateway.yaml | 4 +- 
.../service/templates/logger-service.yaml | 2 +- .../service/templates/router-service.yaml | 2 +- deployments/charts/service/values.yaml | 66 +++------- src/utils/static_config.py | 83 ++++++++++++- 11 files changed, 178 insertions(+), 175 deletions(-) diff --git a/deployments/charts/service/README.md b/deployments/charts/service/README.md index abbb62c3b..1e3898f69 100644 --- a/deployments/charts/service/README.md +++ b/deployments/charts/service/README.md @@ -405,26 +405,27 @@ Envoy uses filesystem-based dynamic configuration (LDS/CDS). When the ConfigMap #### Gateway → Upstream TLS -When `gateway.tls.enabled` is `true`, traffic between the Envoy gateway and the upstream services (`osmo-service`, `osmo-router`, `osmo-agent`, `osmo-logger`) is encrypted end-to-end. The Envoy verifies upstream certs against a CA, and each upstream serves HTTPS via uvicorn's `--ssl_keyfile` / `--ssl_certfile`. The UI intentionally stays on plain HTTP and relies on `gateway.networkPolicies` for ingress restriction (Next.js does not natively serve TLS). +Traffic between the Envoy gateway and the upstream services (`osmo-service`, `osmo-router`, `osmo-agent`, `osmo-logger`) is encrypted by default. The UI intentionally stays on plain HTTP behind NetworkPolicy — Next.js does not natively serve TLS. -Cert material can be provisioned in two ways: +Two modes: -- **A1 (default)** — the chart self-signs a CA (`-ca-tls`) and per-service leaf certs (`osmo-service-tls`, `osmo-router-tls`, `osmo-agent-tls`, `osmo-logger-tls`) using Sprig's `genCA` / `genSignedCert`. The `lookup` function reuses existing secrets across `helm upgrade` so certs aren't rotated on every release. Zero external dependencies. Default validity is 10 years (CA) and 5 years (leaf). -- **A2 (`gateway.tls.certManager.enabled: true`)** — the chart emits cert-manager `Issuer` + `Certificate` resources. By default it creates a self-signed root + a CA Issuer + per-service Certificates. 
To plug in an existing CA (Vault, internal PKI, ACME), set `gateway.tls.certManager.issuerRef`. Requires cert-manager installed in the cluster. +**Default — encryption without validation.** Each upstream service mints its own ephemeral self-signed cert in-process at startup (ECDSA P-256, ~1ms), writes it to a temp dir, and loads it into uvicorn's SSLContext. The Python service does this via `--ssl_self_signed true` from the chart; the cert generation happens in the module-level `_mint_ephemeral_self_signed()` helper (`src/utils/static_config.py`), called from `SSLConfig.uvicorn_ssl_kwargs()`. Envoy connects with TLS but configures `common_tls_context: {}` on the upstream cluster — it does *not* validate the cert. The wire is encrypted; identity verification is delegated to NetworkPolicy + Kubernetes RBAC. + +This means: no CA management, no Secrets to rotate, no ArgoCD churn, no init containers, no cross-pod cert dependency. Cert lifecycle is tied to process lifecycle — a pod restart mints a fresh cert. + +**Validated — cert-manager.** Set `gateway.tls.certManager.enabled: true`. The chart emits cert-manager `Issuer` + `Certificate` resources. By default it creates a self-signed root + a CA Issuer + per-service Certificates; the upstream Deployments mount the resulting Secrets read-only. To plug in an existing CA (Vault, internal PKI, ACME), set `gateway.tls.certManager.issuerRef`. Requires cert-manager installed in the cluster. | Parameter | Description | Default | |-----------|-------------|---------| -| `gateway.tls.enabled` | Encrypt gateway → upstream traffic. Generates certs (mode chosen below), wires upstream Deployments to serve HTTPS, and adds `UpstreamTlsContext` + `sni:` to each Envoy cluster. 
| `false` | -| `gateway.tls.caDuration` | CA cert validity (cert-manager mode) | `87600h` (10y) | -| `gateway.tls.caRenewBefore` | Renew CA this long before expiry (cert-manager mode) | `720h` (30d) | -| `gateway.tls.certDuration` | Leaf cert validity (cert-manager mode) | `43800h` (5y) | -| `gateway.tls.certRenewBefore` | Renew leaf this long before expiry (cert-manager mode) | `360h` (15d) | -| `gateway.tls.certManager.enabled` | Use cert-manager to manage cert lifecycle (A2). Requires cert-manager. | `false` | +| `gateway.tls.enabled` | Encrypt gateway → upstream traffic. | `true` | +| `gateway.tls.caDuration` | CA cert validity (cert-manager mode). | `87600h` (10y) | +| `gateway.tls.caRenewBefore` | Renew CA this long before expiry (cert-manager mode). | `720h` (30d) | +| `gateway.tls.certDuration` | Leaf cert validity (cert-manager mode). | `43800h` (5y) | +| `gateway.tls.certRenewBefore` | Renew leaf this long before expiry (cert-manager mode). | `360h` (15d) | +| `gateway.tls.certManager.enabled` | Switch from default mode to cert-manager-managed validated TLS. | `false` | | `gateway.tls.certManager.issuerRef` | Optional: point at an existing Issuer/ClusterIssuer. Map with `name`, `kind` (`Issuer` or `ClusterIssuer`), and `group` (defaults to `cert-manager.io`). When empty, the chart creates a self-signed Issuer + CA chain. | `{}` | -The Helm-mode (A1) Secrets are annotated with `helm.sh/resource-policy: keep` so a `helm uninstall` won't shred the CA — protects against accidental rotation. - -NetworkPolicy and TLS are independent: NetworkPolicy controls *who* can connect at L3/L4; TLS encrypts the bytes at L7. Run them together for defense in depth, or either alone. +NetworkPolicy and TLS are independent: NetworkPolicy controls *who* can connect at L3/L4; TLS encrypts the bytes at L7. Run them together for defense in depth. 
### Extensibility diff --git a/deployments/charts/service/templates/_gateway-envoy-config.tpl b/deployments/charts/service/templates/_gateway-envoy-config.tpl index 896a44d1a..d882cb1de 100644 --- a/deployments/charts/service/templates/_gateway-envoy-config.tpl +++ b/deployments/charts/service/templates/_gateway-envoy-config.tpl @@ -70,7 +70,7 @@ data: filename: /etc/ssl/envoy-certs/tls.key {{- end }} - {{- if $gw.tls.enabled }} + {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} sds_upstream_ca.yaml: | resources: - "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.Secret @@ -579,6 +579,7 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.service.host }} + {{- if $gw.tls.certManager.enabled }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -587,6 +588,9 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config + {{- else }} + common_tls_context: {} + {{- end }} {{- end }} {{- if $gw.upstreams.router.enabled }} @@ -613,6 +617,7 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.router.host }} + {{- if $gw.tls.certManager.enabled }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -621,6 +626,9 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config + {{- else }} + common_tls_context: {} + {{- end }} {{- end }} {{- end }} @@ -670,6 +678,7 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.agent.host }} + {{- if $gw.tls.certManager.enabled }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -678,6 +687,9 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config + {{- else }} + common_tls_context: {} + {{- end }} {{- end 
}} {{- end }} @@ -703,6 +715,7 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.logger.host }} + {{- if $gw.tls.certManager.enabled }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -711,6 +724,9 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config + {{- else }} + common_tls_context: {} + {{- end }} {{- end }} {{- end }} @@ -823,6 +839,7 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $jwksHost }} + {{- if $gw.tls.certManager.enabled }} common_tls_context: validation_context_sds_secret_config: name: upstream_ca @@ -831,6 +848,9 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config + {{- else }} + common_tls_context: {} + {{- end }} {{- end }} {{- end }} diff --git a/deployments/charts/service/templates/_gateway-helpers.tpl b/deployments/charts/service/templates/_gateway-helpers.tpl index a64295e36..9e424c0ae 100644 --- a/deployments/charts/service/templates/_gateway-helpers.tpl +++ b/deployments/charts/service/templates/_gateway-helpers.tpl @@ -32,27 +32,35 @@ app.kubernetes.io/component: {{ .component }} {{- end }} {{/* -Per-upstream TLS args. Used by api-service, agent-service, logger-service, -router-service when gateway.tls.enabled is true. Pass a dict with "context". +Per-upstream TLS args. Pass a dict with "context". -Outputs --ssl_keyfile and --ssl_certfile, indented to fit container args lists. -The mount path is fixed at /etc/osmo/tls; the cert is provided by the -{name}-tls Secret created by gateway-tls.yaml. +Default mode (no cert-manager): emits --ssl_self_signed true. The Python +service mints a fresh self-signed cert at process start and points uvicorn +at it. No init container, no volume, no Secret on the chart side. 
+ +cert-manager mode: emits --ssl_keyfile / --ssl_certfile pointing at the +PEMs mounted from the cert-manager-managed Secret at /etc/osmo/tls. */}} {{- define "osmo.upstream-tls-args" -}} {{- if .context.Values.gateway.tls.enabled }} +{{- if .context.Values.gateway.tls.certManager.enabled }} - --ssl_keyfile - /etc/osmo/tls/tls.key - --ssl_certfile - /etc/osmo/tls/tls.crt +{{- else }} +- --ssl_self_signed +- "true" +{{- end }} {{- end }} {{- end }} {{/* -TLS volume mount for an upstream container. Use under volumeMounts. +TLS volume mount for an upstream container. Only emitted in cert-manager +mode — default mode keeps cert material in an in-process tempdir. */}} {{- define "osmo.upstream-tls-volume-mount" -}} -{{- if .context.Values.gateway.tls.enabled }} +{{- if and .context.Values.gateway.tls.enabled .context.Values.gateway.tls.certManager.enabled }} - name: tls mountPath: /etc/osmo/tls readOnly: true @@ -61,10 +69,11 @@ TLS volume mount for an upstream container. Use under volumeMounts. {{/* TLS volume for an upstream pod. Pass dict with "context" and "secret" (the -per-service Secret name, e.g. "osmo-service-tls"). Use under volumes. +per-service Secret name in cert-manager mode, e.g. "osmo-service-tls"). +Only emitted in cert-manager mode. 
*/}} {{- define "osmo.upstream-tls-volume" -}} -{{- if .context.Values.gateway.tls.enabled }} +{{- if and .context.Values.gateway.tls.enabled .context.Values.gateway.tls.certManager.enabled }} - name: tls secret: secretName: {{ .secret }} diff --git a/deployments/charts/service/templates/agent-service.yaml b/deployments/charts/service/templates/agent-service.yaml index 080a5f727..53c03a366 100644 --- a/deployments/charts/service/templates/agent-service.yaml +++ b/deployments/charts/service/templates/agent-service.yaml @@ -155,7 +155,7 @@ spec: {{- end }} imagePullPolicy: {{ .Values.services.agent.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts .Values.gateway.tls.enabled }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} diff --git a/deployments/charts/service/templates/api-service.yaml b/deployments/charts/service/templates/api-service.yaml index a7209be0b..20ae848fd 100644 --- a/deployments/charts/service/templates/api-service.yaml +++ b/deployments/charts/service/templates/api-service.yaml @@ -194,7 +194,7 @@ spec: ports: - name: metrics containerPort: 9464 - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts .Values.gateway.tls.enabled }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} diff --git 
a/deployments/charts/service/templates/gateway-tls.yaml b/deployments/charts/service/templates/gateway-tls.yaml index 0a87a3541..a2cdf2b6b 100644 --- a/deployments/charts/service/templates/gateway-tls.yaml +++ b/deployments/charts/service/templates/gateway-tls.yaml @@ -15,32 +15,33 @@ # SPDX-License-Identifier: Apache-2.0 {{/* -Gateway -> upstream TLS materials. - -Two modes, controlled by gateway.tls.certManager.enabled: - - A1 (default, certManager.enabled=false): Helm self-signs a CA and per-service - server certs. We use `lookup` so existing certs are reused across - `helm upgrade` — otherwise every release would rotate certs and break live - connections. The CA validity is 10 years; leaf certs are 5 years. Long - enough that rotation is a deliberate operation, short enough that the - private keys aren't immortal. - - A2 (certManager.enabled=true): emit cert-manager Issuer + Certificate - resources. cert-manager owns rotation. The Issuer is self-signed by - default; users can point at an existing ClusterIssuer (e.g. Vault, an - internal PKI) via gateway.tls.certManager.issuerRef. +Gateway -> upstream TLS. + +Default mode (gateway.tls.enabled=true, certManager.enabled=false): + Each upstream service mints an ephemeral self-signed cert in-process at + startup and writes it to a temp dir. uvicorn serves HTTPS with that cert. + Envoy connects with TLS but does NOT validate (common_tls_context: {} on + the upstream cluster). The wire is encrypted; identity verification is + handled by NetworkPolicy + Kubernetes RBAC, not by cert pinning. + + This template emits nothing in default mode. The --ssl_self_signed flag + is wired into each Deployment via helpers in + _gateway-helpers.tpl, and the Envoy CDS uses an empty + common_tls_context. Result: no Secrets, no Jobs, no CA, no rotation — + cert lifecycle is tied to process lifecycle. 
+ +cert-manager mode (gateway.tls.enabled=true, certManager.enabled=true): + cert-manager manages a CA and per-service leaf certs as proper Secrets. + Envoy validates upstreams against that CA. Use this when you need + validated/audited TLS or want to plug in an enterprise PKI. */}} {{- if .Values.gateway.tls.enabled }} +{{- if .Values.gateway.tls.certManager.enabled }} {{- $gw := .Values.gateway }} {{- $gwName := include "osmo.gateway-name" . }} {{- $ns := .Release.Namespace }} -{{/* - Upstream service definitions. Each entry produces one server-cert Secret with - SANs covering the in-cluster DNS names Envoy uses to reach the service. -*/}} {{- $upstreams := list (dict "name" "osmo-service" "secret" "osmo-service-tls" "dnsNames" (list "osmo-service" (printf "osmo-service.%s" $ns) (printf "osmo-service.%s.svc" $ns) (printf "osmo-service.%s.svc.cluster.local" $ns))) @@ -54,15 +55,9 @@ Two modes, controlled by gateway.tls.certManager.enabled: "dnsNames" (list "osmo-logger" (printf "osmo-logger.%s" $ns) (printf "osmo-logger.%s.svc" $ns) (printf "osmo-logger.%s.svc.cluster.local" $ns))) }} -{{- if $gw.tls.certManager.enabled }} - -{{/* ===================================================================== */}} -{{/* A2 — cert-manager */}} -{{/* ===================================================================== */}} - {{- if not $gw.tls.certManager.issuerRef }} -{{/* No external issuer — create a self-signed Issuer + a CA Certificate */}} -{{/* and a CA Issuer that signs the per-service certs. */}} +{{/* No external issuer — create a self-signed Issuer + a CA Certificate + and a CA Issuer that signs the per-service certs. 
*/}} apiVersion: cert-manager.io/v1 kind: Issuer metadata: @@ -127,70 +122,5 @@ spec: group: {{ $issuerGroup }} {{- end }} -{{- else }} - -{{/* ===================================================================== */}} -{{/* A1 — Helm self-signed */}} -{{/* ===================================================================== */}} - -{{/* - Look up an existing CA secret. If found, reuse it so `helm upgrade` doesn't - rotate the CA on every release. If not, generate one. Use buildCustomCert - to reconstruct the sprig.certificate struct that genSignedCert requires. -*/}} -{{- $caSecret := lookup "v1" "Secret" $ns (printf "%s-ca-tls" $gwName) }} -{{- $caCertB64 := "" }} -{{- $caKeyB64 := "" }} -{{- if and $caSecret $caSecret.data (index $caSecret.data "ca.crt") (index $caSecret.data "ca.key") }} - {{- $caCertB64 = index $caSecret.data "ca.crt" }} - {{- $caKeyB64 = index $caSecret.data "ca.key" }} -{{- else }} - {{- $generated := genCA (printf "%s-ca" $gwName) 3650 }} - {{- $caCertB64 = $generated.Cert | b64enc }} - {{- $caKeyB64 = $generated.Key | b64enc }} -{{- end }} -{{- $ca := buildCustomCert $caCertB64 $caKeyB64 }} - -apiVersion: v1 -kind: Secret -metadata: - name: {{ $gwName }}-ca-tls - annotations: - "helm.sh/resource-policy": keep -type: Opaque -data: - ca.crt: {{ $caCertB64 }} - ca.key: {{ $caKeyB64 }} - -{{- range $u := $upstreams }} ---- -{{/* - Per-service leaf cert. Reuse on upgrade if a usable secret already exists; - otherwise sign a fresh one with the CA above. 5-year validity. 
-*/}} -{{- $existing := lookup "v1" "Secret" $ns $u.secret }} -{{- $leafCrtB64 := "" }} -{{- $leafKeyB64 := "" }} -{{- if and $existing $existing.data (index $existing.data "tls.crt") (index $existing.data "tls.key") }} - {{- $leafCrtB64 = index $existing.data "tls.crt" }} - {{- $leafKeyB64 = index $existing.data "tls.key" }} -{{- else }} - {{- $cert := genSignedCert $u.name nil $u.dnsNames 1825 $ca }} - {{- $leafCrtB64 = $cert.Cert | b64enc }} - {{- $leafKeyB64 = $cert.Key | b64enc }} -{{- end }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ $u.secret }} - annotations: - "helm.sh/resource-policy": keep -type: kubernetes.io/tls -data: - tls.crt: {{ $leafCrtB64 }} - tls.key: {{ $leafKeyB64 }} - ca.crt: {{ $caCertB64 }} -{{- end }} - -{{- end }}{{/* end A1/A2 branch */}} +{{- end }}{{/* end certManager.enabled */}} {{- end }}{{/* end gateway.tls.enabled */}} diff --git a/deployments/charts/service/templates/gateway.yaml b/deployments/charts/service/templates/gateway.yaml index 56105245e..1951c4347 100644 --- a/deployments/charts/service/templates/gateway.yaml +++ b/deployments/charts/service/templates/gateway.yaml @@ -84,7 +84,7 @@ spec: - mountPath: /var/config name: envoy-config readOnly: true - {{- if $gw.tls.enabled }} + {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} - name: gateway-tls-ca mountPath: /etc/gateway-tls readOnly: true @@ -112,7 +112,7 @@ spec: - name: envoy-config configMap: name: {{ $gwName }}-envoy-config - {{- if $gw.tls.enabled }} + {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} - name: gateway-tls-ca secret: secretName: {{ $gwName }}-ca-tls diff --git a/deployments/charts/service/templates/logger-service.yaml b/deployments/charts/service/templates/logger-service.yaml index 4ee27693f..da89eac1a 100644 --- a/deployments/charts/service/templates/logger-service.yaml +++ b/deployments/charts/service/templates/logger-service.yaml @@ -146,7 +146,7 @@ spec: {{- end }} imagePullPolicy: {{ 
.Values.services.logger.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts .Values.gateway.tls.enabled }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} diff --git a/deployments/charts/service/templates/router-service.yaml b/deployments/charts/service/templates/router-service.yaml index 2bcc7b628..d37a94fd6 100644 --- a/deployments/charts/service/templates/router-service.yaml +++ b/deployments/charts/service/templates/router-service.yaml @@ -163,7 +163,7 @@ spec: protocol: {{ .protocol | default "TCP" }} {{- end }} {{- end }} - {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts .Values.gateway.tls.enabled }} + {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} volumeMounts: {{- if .Values.services.configFile.enabled}} - mountPath: {{ .Values.services.configFile.path }} diff --git a/deployments/charts/service/values.yaml b/deployments/charts/service/values.yaml index 3104e6318..a35fb6c68 100644 --- a/deployments/charts/service/values.yaml +++ b/deployments/charts/service/values.yaml @@ -2085,12 +2085,9 @@ gateway: nodeSelector: {} tolerations: [] - ## ----------------------------------------------------------------------- - ## Network Policies — restrict ingress to upstream service pods so that - ## only the gateway Envoy pods can reach them. Requires a CNI that - ## enforces NetworkPolicy (e.g. Calico, Cilium, or AWS VPC CNI with + ## Restrict upstream pod ingress to the gateway Envoy. 
Requires a CNI + ## that enforces NetworkPolicy (Calico, Cilium, AWS VPC CNI with ## enable-network-policy-controller=true). - ## ----------------------------------------------------------------------- networkPolicies: enabled: false upstreams: @@ -2107,56 +2104,25 @@ gateway: app: osmo-ui port: 8000 - ## ----------------------------------------------------------------------- - ## TLS — encrypted gateway-to-upstream communication. - ## - ## When enabled: - ## - Cert material is generated (see two modes below) - ## - Envoy validates upstreams against the CA - ## - Upstream Deployments mount their per-service Secret and start uvicorn - ## with --ssl_keyfile / --ssl_certfile so they serve HTTPS on :8000 - ## - Probes use scheme: HTTPS - ## - ## Two modes for cert material: - ## - ## A1 (default, certManager.enabled=false): - ## Helm self-signs a CA + per-service leaf certs using genCA / - ## genSignedCert with `lookup` so existing certs are reused across - ## `helm upgrade`. Zero external dependencies. - ## - ## A2 (certManager.enabled=true): - ## Emit cert-manager Issuer + Certificate resources. Provide your own - ## issuerRef (e.g. a Vault ClusterIssuer) or omit it to let the chart - ## create a self-signed Issuer + CA chain. Production-grade rotation. - ## - ## The UI is intentionally NOT served over TLS — it stays HTTP behind - ## NetworkPolicy. Only osmo-service, osmo-router, osmo-agent, and - ## osmo-logger get TLS server certs. - ## ----------------------------------------------------------------------- + ## TLS for gateway -> upstream traffic. Default: each service mints an + ## ephemeral self-signed cert in-process; Envoy uses common_tls_context: {} + ## (encrypt, no validation). Set certManager.enabled to use cert-manager + ## for validated TLS instead. UI is excluded — stays HTTP. tls: - enabled: false - - ## CA cert validity (cert-manager mode). 10 years by default — long - ## enough that rotation is a deliberate operation. 
- caDuration: 87600h - caRenewBefore: 720h + enabled: true - ## Leaf-cert validity (cert-manager mode). 5 years by default. - certDuration: 43800h - certRenewBefore: 360h + ## cert-manager mode validity periods. + caDuration: 87600h # 10y + caRenewBefore: 720h # 30d + certDuration: 43800h # 5y + certRenewBefore: 360h # 15d certManager: - ## Use cert-manager (A2) instead of Helm-generated certs (A1). + ## Switch from in-process self-signed to cert-manager-managed CA + leafs. ## Requires cert-manager installed in the cluster. enabled: false - ## Optional: point at an existing Issuer/ClusterIssuer instead of - ## letting the chart create a self-signed CA chain. Set name + kind - ## (Issuer or ClusterIssuer) + group. - ## - ## Example: - ## issuerRef: - ## name: vault-issuer - ## kind: ClusterIssuer - ## group: cert-manager.io + ## Optional: existing Issuer/ClusterIssuer to use instead of the + ## chart's self-signed CA. Example: + ## issuerRef: {name: vault-issuer, kind: ClusterIssuer, group: cert-manager.io} issuerRef: {} diff --git a/src/utils/static_config.py b/src/utils/static_config.py index 43bee357e..9c72a39ad 100644 --- a/src/utils/static_config.py +++ b/src/utils/static_config.py @@ -17,11 +17,19 @@ """ import argparse +import datetime +import ipaddress import os +import socket import sys +import tempfile import typing from typing import Any, ClassVar, Dict, Optional +from cryptography import x509 +from cryptography.hazmat.primitives import hashes, serialization +from cryptography.hazmat.primitives.asymmetric import ec +from cryptography.x509.oid import NameOID import pydantic from pydantic.fields import FieldInfo import yaml @@ -38,9 +46,21 @@ def _get_field_extras(field: FieldInfo) -> Dict[str, Any]: class SSLConfig(pydantic.BaseModel): """TLS/SSL configuration for the uvicorn listener. - Mixed into every service config so a single set of --ssl_keyfile / --ssl_certfile - flags works uniformly. When both files are set, uvicorn serves HTTPS; otherwise - HTTP. 
Used by the gateway -> upstream TLS path (gateway.tls.enabled in the chart). + Two modes, picked by which flags are set: + + 1. ssl_self_signed=True + The process mints a fresh ECDSA P-256 cert at startup, writes it to a + temp dir, and points uvicorn at it. The cert is per-process and lives + only as long as the container. Used by the chart's default + gateway.tls.enabled mode where Envoy connects with TLS but does not + validate the upstream cert (common_tls_context: {}). No CA management, + no Secret rotation, no init container needed. + + 2. ssl_keyfile and ssl_certfile point at on-disk PEMs (e.g. mounted from + a cert-manager-managed Secret). The process serves HTTPS using the + provided cert. Used when the chart is in cert-manager mode. + + With neither set, the listener serves plain HTTP. """ ssl_keyfile: Optional[str] = pydantic.Field( default=None, @@ -52,14 +72,71 @@ class SSLConfig(pydantic.BaseModel): description='Path to a PEM-encoded certificate (server leaf, optionally ' 'chained). Required together with ssl_keyfile.', json_schema_extra={'command_line': 'ssl_certfile', 'env': 'OSMO_SSL_CERTFILE'}) + ssl_self_signed: bool = pydantic.Field( + default=False, + description='Generate an ephemeral self-signed cert in-process and ' + 'serve HTTPS with it. The cert is regenerated on every ' + 'process start. Useful when the consumer (e.g. 
the OSMO ' + 'gateway) wants encryption-without-validation.', + json_schema_extra={'command_line': 'ssl_self_signed', + 'env': 'OSMO_SSL_SELF_SIGNED'}) def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: """Return uvicorn keyword args for TLS, or an empty dict if TLS is off.""" + if self.ssl_self_signed: + keyfile, certfile = _mint_ephemeral_self_signed() + return {'ssl_keyfile': keyfile, 'ssl_certfile': certfile} if self.ssl_keyfile and self.ssl_certfile: return {'ssl_keyfile': self.ssl_keyfile, 'ssl_certfile': self.ssl_certfile} return {} +def _mint_ephemeral_self_signed() -> tuple[str, str]: + """Generate an ECDSA P-256 self-signed cert and write it to a temp dir. + + Returns (keyfile_path, certfile_path). uvicorn opens both at startup and + parses them into an in-memory SSLContext, so the files only need to exist + long enough for uvicorn's ssl.SSLContext.load_cert_chain() call. We don't + bother deleting them because the temp dir goes away when the container + exits. + + SANs include the pod hostname so anything that DOES validate (e.g. a + cluster-internal probe with HTTPS scheme) gets a name match. Envoy with + common_tls_context: {} ignores SANs entirely. 
+ """ + private_key = ec.generate_private_key(ec.SECP256R1()) + hostname = socket.gethostname() or 'localhost' + subject = issuer = x509.Name([x509.NameAttribute(NameOID.COMMON_NAME, hostname)]) + + san_entries: list[x509.GeneralName] = [x509.DNSName(hostname), x509.DNSName('localhost')] + san_entries.append(x509.IPAddress(ipaddress.ip_address('127.0.0.1'))) + + now = datetime.datetime.now(datetime.timezone.utc) + cert = (x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(issuer) + .public_key(private_key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(now - datetime.timedelta(minutes=5)) + .not_valid_after(now + datetime.timedelta(days=365)) + .add_extension(x509.SubjectAlternativeName(san_entries), critical=False) + .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True) + .sign(private_key, hashes.SHA256())) + + tmpdir = tempfile.mkdtemp(prefix='osmo-tls-') + keyfile_path = os.path.join(tmpdir, 'tls.key') + certfile_path = os.path.join(tmpdir, 'tls.crt') + with open(keyfile_path, 'wb') as f: + f.write(private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption())) + os.chmod(keyfile_path, 0o600) + with open(certfile_path, 'wb') as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return keyfile_path, certfile_path + + class StaticConfig(pydantic.BaseModel): """ A class for reading in config information from either command line, files, or environment variables """ From d118c3b5a6495fe3f1e789d81322afb0c0179114 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 14:12:35 -0700 Subject: [PATCH 3/9] Move SSLConfig into separate ssl_config module --- src/service/agent/BUILD | 1 + src/service/agent/agent_service.py | 4 +- src/service/core/workflow/BUILD | 1 + src/service/core/workflow/objects.py | 4 +- src/service/logger/BUILD | 1 + src/service/logger/logger.py | 4 +- src/service/router/BUILD | 1 +
src/service/router/router.py | 4 +- src/utils/BUILD | 12 +++ src/utils/ssl_config.py | 131 +++++++++++++++++++++++++++ src/utils/static_config.py | 102 --------------------- 11 files changed, 155 insertions(+), 110 deletions(-) create mode 100644 src/utils/ssl_config.py diff --git a/src/service/agent/BUILD b/src/service/agent/BUILD index 4c946608b..0f57c6753 100644 --- a/src/service/agent/BUILD +++ b/src/service/agent/BUILD @@ -32,6 +32,7 @@ osmo_py_library( requirement("fastapi"), requirement("uvicorn"), "//src/utils:backend_messages", + "//src/utils:ssl_config", "//src/utils/metrics", "//src/utils/progress_check:progress_check_lib", "//src/service/core/auth", diff --git a/src/service/agent/agent_service.py b/src/service/agent/agent_service.py index a4f0e5465..2b34efa8c 100644 --- a/src/service/agent/agent_service.py +++ b/src/service/agent/agent_service.py @@ -32,7 +32,7 @@ from src.service.core.config import configmap_loader from src.service.core.config.configmap_loader import ConfigFileMixin from src.service.core.workflow import objects -from src.utils import connectors, static_config +from src.utils import connectors, ssl_config, static_config from src.utils.progress_check import progress @@ -43,7 +43,7 @@ class BackendServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, - static_config.SSLConfig, ConfigFileMixin): + ssl_config.SSLConfig, ConfigFileMixin): """Config settings for the backend service""" progress_period: int = pydantic.Field( default=30, diff --git a/src/service/core/workflow/BUILD b/src/service/core/workflow/BUILD index 7649e8e82..d5116ae42 100644 --- a/src/service/core/workflow/BUILD +++ b/src/service/core/workflow/BUILD @@ -41,6 +41,7 @@ osmo_py_library( "//src/lib/utils:osmo_errors", "//src/lib/utils:redact", "//src/service/core/config:configmap_loader_lib", + "//src/utils:ssl_config", "//src/utils:static_config", "//src/utils/job:job", "//src/utils:yaml", diff --git 
a/src/service/core/workflow/objects.py b/src/service/core/workflow/objects.py index ab5838040..366637d10 100644 --- a/src/service/core/workflow/objects.py +++ b/src/service/core/workflow/objects.py @@ -33,14 +33,14 @@ from src.service.core.config.configmap_loader import ConfigFileMixin from src.utils.job import app, common as task_common, jobs, kb_objects, task, workflow from src.utils.job.task import _encode_hstore -from src.utils import connectors, static_config, yaml as util_yaml +from src.utils import connectors, ssl_config, static_config, yaml as util_yaml from src.utils.metrics import metrics class WorkflowServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, - static_config.SSLConfig, + ssl_config.SSLConfig, metrics.MetricsCreatorConfig, ConfigFileMixin): """ Manages configuration specific to the workflow service. """ diff --git a/src/service/logger/BUILD b/src/service/logger/BUILD index 60b5c5fde..c4b78a88f 100644 --- a/src/service/logger/BUILD +++ b/src/service/logger/BUILD @@ -49,6 +49,7 @@ osmo_py_library( "//src/service/core/auth", "//src/service/core/config:configmap_loader_lib", "//src/service/core/workflow", + "//src/utils:ssl_config", "//src/utils:static_config", "//src/utils/job", ], diff --git a/src/service/logger/logger.py b/src/service/logger/logger.py index cea943da2..05937ceaf 100644 --- a/src/service/logger/logger.py +++ b/src/service/logger/logger.py @@ -28,14 +28,14 @@ from src.service.core.auth import auth_service from src.service.core.config import configmap_loader from src.service.core.config.configmap_loader import ConfigFileMixin -from src.utils import connectors, static_config +from src.utils import connectors, ssl_config, static_config from src.utils.progress_check import progress class LoggerServiceConfig(connectors.RedisConfig, connectors.PostgresConfig, src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, - static_config.SSLConfig, 
ConfigFileMixin): + ssl_config.SSLConfig, ConfigFileMixin): """Config settings for the logger service""" host: str = pydantic.Field( default='http://0.0.0.0:8000', diff --git a/src/service/router/BUILD b/src/service/router/BUILD index 10e6ca825..a99a6d8fe 100644 --- a/src/service/router/BUILD +++ b/src/service/router/BUILD @@ -36,6 +36,7 @@ osmo_py_library( "//src/lib/utils:common", "//src/lib/utils:logging", "//src/lib/utils:version", + "//src/utils:ssl_config", "//src/utils:static_config", ], visibility = ["//visibility:public"], diff --git a/src/service/router/router.py b/src/service/router/router.py index 0e46a4991..520d10974 100644 --- a/src/service/router/router.py +++ b/src/service/router/router.py @@ -32,11 +32,11 @@ from src.lib.utils import common, version import src.lib.utils.logging from src.service.router import helper -from src.utils import connectors, static_config +from src.utils import connectors, ssl_config, static_config class RouterServiceConfig(src.lib.utils.logging.LoggingConfig, static_config.StaticConfig, - static_config.SSLConfig, connectors.PostgresConfig): + ssl_config.SSLConfig, connectors.PostgresConfig): """Config settings for the logger service""" host: str = pydantic.Field( default='http://0.0.0.0:8000', diff --git a/src/utils/BUILD b/src/utils/BUILD index 5666af24c..a028134a9 100644 --- a/src/utils/BUILD +++ b/src/utils/BUILD @@ -50,6 +50,18 @@ osmo_py_library( visibility = ["//visibility:public"], ) +osmo_py_library( + name = "ssl_config", + srcs = [ + "ssl_config.py", + ], + deps = [ + requirement("cryptography"), + requirement("pydantic"), + ], + visibility = ["//visibility:public"], +) + osmo_py_library( name = "auth", srcs = ["auth.py"], diff --git a/src/utils/ssl_config.py b/src/utils/ssl_config.py new file mode 100644 index 000000000..c9a5eaa03 --- /dev/null +++ b/src/utils/ssl_config.py @@ -0,0 +1,131 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 + +TLS/SSL configuration for services that listen on a uvicorn port. Lives in +its own module (rather than next to StaticConfig) because it pulls in the +`cryptography` package, and small utility binaries that only need +StaticConfig (e.g. progress_check) shouldn't have to bundle that dep. +""" + +import datetime +import ipaddress +import os +import socket +import tempfile +from typing import Any, Dict, Optional, Tuple + +from cryptography import x509 +from cryptography.hazmat.primitives import hashes, serialization +from cryptography.hazmat.primitives.asymmetric import ec +from cryptography.x509.oid import NameOID +import pydantic + + +class SSLConfig(pydantic.BaseModel): + """TLS/SSL configuration for the uvicorn listener. + + Three modes, picked by which flags are set: + + 1. ssl_self_signed=True + The process mints a fresh ECDSA P-256 cert at startup, writes it to a + temp dir, and points uvicorn at it. The cert is per-process and lives + only as long as the container. Used by the chart's default + gateway.tls.enabled mode where Envoy connects with TLS but does not + validate the upstream cert (common_tls_context: {}). No CA management, + no Secret rotation, no init container needed. + + 2. ssl_keyfile and ssl_certfile point at on-disk PEMs (e.g. mounted from + a cert-manager-managed Secret). The process serves HTTPS using the + provided cert. 
Used when the chart is in cert-manager mode. + + 3. None set: plain HTTP. + """ + ssl_keyfile: Optional[str] = pydantic.Field( + default=None, + description='Path to a PEM-encoded private key. If set together with ' + 'ssl_certfile, the service serves HTTPS instead of HTTP.', + json_schema_extra={'command_line': 'ssl_keyfile', 'env': 'OSMO_SSL_KEYFILE'}) + ssl_certfile: Optional[str] = pydantic.Field( + default=None, + description='Path to a PEM-encoded certificate (server leaf, optionally ' + 'chained). Required together with ssl_keyfile.', + json_schema_extra={'command_line': 'ssl_certfile', 'env': 'OSMO_SSL_CERTFILE'}) + ssl_self_signed: bool = pydantic.Field( + default=False, + description='Generate an ephemeral self-signed cert in-process and ' + 'serve HTTPS with it. The cert is regenerated on every ' + 'process start. Useful when the consumer (e.g. the OSMO ' + 'gateway) wants encryption-without-validation.', + json_schema_extra={'command_line': 'ssl_self_signed', + 'env': 'OSMO_SSL_SELF_SIGNED'}) + + def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: + """Return uvicorn keyword args for TLS, or an empty dict if TLS is off.""" + if self.ssl_self_signed: + keyfile, certfile = _mint_ephemeral_self_signed() + return {'ssl_keyfile': keyfile, 'ssl_certfile': certfile} + if self.ssl_keyfile and self.ssl_certfile: + return {'ssl_keyfile': self.ssl_keyfile, 'ssl_certfile': self.ssl_certfile} + return {} + + +def _mint_ephemeral_self_signed() -> Tuple[str, str]: + """Generate an ECDSA P-256 self-signed cert and write it to a temp dir. + + Returns (keyfile_path, certfile_path). uvicorn opens both at startup and + parses them into an in-memory SSLContext, so the files only need to exist + long enough for ssl.SSLContext.load_cert_chain() to read them. We don't + bother deleting them; the temp dir goes away when the container exits. + + SANs include the pod hostname so anything that DOES validate (e.g. a + cluster-internal probe with HTTPS scheme) gets a name match. 
Envoy with + common_tls_context: {} ignores SANs entirely. + """ + private_key = ec.generate_private_key(ec.SECP256R1()) + hostname = socket.gethostname() or 'localhost' + subject = issuer = x509.Name([x509.NameAttribute(NameOID.COMMON_NAME, hostname)]) + + san_entries: list[x509.GeneralName] = [ + x509.DNSName(hostname), + x509.DNSName('localhost'), + x509.IPAddress(ipaddress.ip_address('127.0.0.1')), + ] + + now = datetime.datetime.now(datetime.timezone.utc) + cert = (x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(issuer) + .public_key(private_key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(now - datetime.timedelta(minutes=5)) + .not_valid_after(now + datetime.timedelta(days=365)) + .add_extension(x509.SubjectAlternativeName(san_entries), critical=False) + .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True) + .sign(private_key, hashes.SHA256())) + + tmpdir = tempfile.mkdtemp(prefix='osmo-tls-') + keyfile_path = os.path.join(tmpdir, 'tls.key') + certfile_path = os.path.join(tmpdir, 'tls.crt') + with open(keyfile_path, 'wb') as f: + f.write(private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption())) + os.chmod(keyfile_path, 0o600) + with open(certfile_path, 'wb') as f: + f.write(cert.public_bytes(serialization.Encoding.PEM)) + return keyfile_path, certfile_path diff --git a/src/utils/static_config.py b/src/utils/static_config.py index 9c72a39ad..fe226efdc 100644 --- a/src/utils/static_config.py +++ b/src/utils/static_config.py @@ -17,19 +17,11 @@ """ import argparse -import datetime -import ipaddress import os -import socket import sys -import tempfile import typing from typing import Any, ClassVar, Dict, Optional -from cryptography import x509 -from cryptography.hazmat.primitives import hashes, serialization -from cryptography.hazmat.primitives.asymmetric import ec -from 
cryptography.x509.oid import NameOID import pydantic from pydantic.fields import FieldInfo import yaml @@ -43,100 +35,6 @@ def _get_field_extras(field: FieldInfo) -> Dict[str, Any]: return {} -class SSLConfig(pydantic.BaseModel): - """TLS/SSL configuration for the uvicorn listener. - - Two modes, picked by which flags are set: - - 1. ssl_self_signed=True - The process mints a fresh ECDSA P-256 cert at startup, writes it to a - temp dir, and points uvicorn at it. The cert is per-process and lives - only as long as the container. Used by the chart's default - gateway.tls.enabled mode where Envoy connects with TLS but does not - validate the upstream cert (common_tls_context: {}). No CA management, - no Secret rotation, no init container needed. - - 2. ssl_keyfile and ssl_certfile point at on-disk PEMs (e.g. mounted from - a cert-manager-managed Secret). The process serves HTTPS using the - provided cert. Used when the chart is in cert-manager mode. - - With neither set, the listener serves plain HTTP. - """ - ssl_keyfile: Optional[str] = pydantic.Field( - default=None, - description='Path to a PEM-encoded private key. If set together with ' - 'ssl_certfile, the service serves HTTPS instead of HTTP.', - json_schema_extra={'command_line': 'ssl_keyfile', 'env': 'OSMO_SSL_KEYFILE'}) - ssl_certfile: Optional[str] = pydantic.Field( - default=None, - description='Path to a PEM-encoded certificate (server leaf, optionally ' - 'chained). Required together with ssl_keyfile.', - json_schema_extra={'command_line': 'ssl_certfile', 'env': 'OSMO_SSL_CERTFILE'}) - ssl_self_signed: bool = pydantic.Field( - default=False, - description='Generate an ephemeral self-signed cert in-process and ' - 'serve HTTPS with it. The cert is regenerated on every ' - 'process start. Useful when the consumer (e.g. 
the OSMO ' - 'gateway) wants encryption-without-validation.', - json_schema_extra={'command_line': 'ssl_self_signed', - 'env': 'OSMO_SSL_SELF_SIGNED'}) - - def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: - """Return uvicorn keyword args for TLS, or an empty dict if TLS is off.""" - if self.ssl_self_signed: - keyfile, certfile = _mint_ephemeral_self_signed() - return {'ssl_keyfile': keyfile, 'ssl_certfile': certfile} - if self.ssl_keyfile and self.ssl_certfile: - return {'ssl_keyfile': self.ssl_keyfile, 'ssl_certfile': self.ssl_certfile} - return {} - - -def _mint_ephemeral_self_signed() -> tuple[str, str]: - """Generate an ECDSA P-256 self-signed cert and write it to a temp dir. - - Returns (keyfile_path, certfile_path). uvicorn opens both at startup and - parses them into an in-memory SSLContext, so the files only need to exist - long enough for uvicorn's ssl.SSLContext.load_cert_chain() call. We don't - bother deleting them because the temp dir goes away when the container - exits. - - SANs include the pod hostname so anything that DOES validate (e.g. a - cluster-internal probe with HTTPS scheme) gets a name match. Envoy with - common_tls_context: {} ignores SANs entirely. 
- """ - private_key = ec.generate_private_key(ec.SECP256R1()) - hostname = socket.gethostname() or 'localhost' - subject = issuer = x509.Name([x509.NameAttribute(NameOID.COMMON_NAME, hostname)]) - - san_entries: list[x509.GeneralName] = [x509.DNSName(hostname), x509.DNSName('localhost')] - san_entries.append(x509.IPAddress(ipaddress.ip_address('127.0.0.1'))) - - now = datetime.datetime.now(datetime.timezone.utc) - cert = (x509.CertificateBuilder() - .subject_name(subject) - .issuer_name(issuer) - .public_key(private_key.public_key()) - .serial_number(x509.random_serial_number()) - .not_valid_before(now - datetime.timedelta(minutes=5)) - .not_valid_after(now + datetime.timedelta(days=365)) - .add_extension(x509.SubjectAlternativeName(san_entries), critical=False) - .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True) - .sign(private_key, hashes.SHA256())) - - tmpdir = tempfile.mkdtemp(prefix='osmo-tls-') - keyfile_path = os.path.join(tmpdir, 'tls.key') - certfile_path = os.path.join(tmpdir, 'tls.crt') - with open(keyfile_path, 'wb') as f: - f.write(private_key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.PKCS8, - encryption_algorithm=serialization.NoEncryption())) - os.chmod(keyfile_path, 0o600) - with open(certfile_path, 'wb') as f: - f.write(cert.public_bytes(serialization.Encoding.PEM)) - return keyfile_path, certfile_path - - class StaticConfig(pydantic.BaseModel): """ A class for reading in config information from either command line, files, or environment variables """ From 494df9b5d996cd588e222f542479ac645cb3cd1c Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 14:42:00 -0700 Subject: [PATCH 4/9] Omit empty common_tls_context from gateway upstream TLS clusters --- .../templates/_gateway-envoy-config.tpl | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/deployments/charts/service/templates/_gateway-envoy-config.tpl b/deployments/charts/service/templates/_gateway-envoy-config.tpl index
d882cb1de..b92cfca37 100644 --- a/deployments/charts/service/templates/_gateway-envoy-config.tpl +++ b/deployments/charts/service/templates/_gateway-envoy-config.tpl @@ -588,9 +588,12 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- else }} - common_tls_context: {} {{- end }} + {{/* Default mode: omit common_tls_context entirely. Setting it + to {} causes Envoy to silently degrade the transport socket + to raw TCP (ssl.handshake stays at 0), so plaintext HTTP + bytes hit uvicorn's TLS listener and the upstream RSTs the + connection. Mirrors what the working `idp` cluster does. */}} {{- end }} {{- if $gw.upstreams.router.enabled }} @@ -626,9 +629,12 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- else }} - common_tls_context: {} {{- end }} + {{/* Default mode: omit common_tls_context entirely. Setting it + to {} causes Envoy to silently degrade the transport socket + to raw TCP (ssl.handshake stays at 0), so plaintext HTTP + bytes hit uvicorn's TLS listener and the upstream RSTs the + connection. Mirrors what the working `idp` cluster does. */}} {{- end }} {{- end }} @@ -687,9 +693,12 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- else }} - common_tls_context: {} {{- end }} + {{/* Default mode: omit common_tls_context entirely. Setting it + to {} causes Envoy to silently degrade the transport socket + to raw TCP (ssl.handshake stays at 0), so plaintext HTTP + bytes hit uvicorn's TLS listener and the upstream RSTs the + connection. Mirrors what the working `idp` cluster does. */}} {{- end }} {{- end }} @@ -724,9 +733,12 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- else }} - common_tls_context: {} {{- end }} + {{/* Default mode: omit common_tls_context entirely. 
Setting it + to {} causes Envoy to silently degrade the transport socket + to raw TCP (ssl.handshake stays at 0), so plaintext HTTP + bytes hit uvicorn's TLS listener and the upstream RSTs the + connection. Mirrors what the working `idp` cluster does. */}} {{- end }} {{- end }} From e88dae692b1d694dfd353fc9969fae6652a252b5 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 14:51:47 -0700 Subject: [PATCH 5/9] Restore common_tls_context with explicit tls_params (TLS 1.2-1.3) --- .../templates/_gateway-envoy-config.tpl | 80 +++++++++++-------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/deployments/charts/service/templates/_gateway-envoy-config.tpl b/deployments/charts/service/templates/_gateway-envoy-config.tpl index b92cfca37..e7a708ba7 100644 --- a/deployments/charts/service/templates/_gateway-envoy-config.tpl +++ b/deployments/charts/service/templates/_gateway-envoy-config.tpl @@ -579,8 +579,15 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.service.host }} - {{- if $gw.tls.certManager.enabled }} common_tls_context: + {{/* Envoy 1.29 upstream defaults to TLS 1.2 max. uvicorn's + SSLContext uses Python defaults (TLS 1.2 floor, 1.3 if + the openssl version supports it). Allow up to 1.3 so + negotiation can pick the most compatible option.
*/}} + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + {{- if $gw.tls.certManager.enabled }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -588,12 +595,7 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- end }} - {{/* Default mode: omit common_tls_context entirely. Setting it - to {} causes Envoy to silently degrade the transport socket - to raw TCP (ssl.handshake stays at 0), so plaintext HTTP - bytes hit uvicorn's TLS listener and the upstream RSTs the - connection. Mirrors what the working `idp` cluster does. */}} + {{- end }} {{- end }} {{- if $gw.upstreams.router.enabled }} @@ -620,8 +622,15 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.router.host }} - {{- if $gw.tls.certManager.enabled }} common_tls_context: + {{/* Envoy 1.29 upstream defaults to TLS 1.2 max. uvicorn's + SSLContext uses Python defaults (TLS 1.2 floor, 1.3 if + the openssl version supports it). Allow up to 1.3 so + negotiation can pick the most compatible option. */}} + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + {{- if $gw.tls.certManager.enabled }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -629,12 +638,7 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- end }} - {{/* Default mode: omit common_tls_context entirely. Setting it - to {} causes Envoy to silently degrade the transport socket - to raw TCP (ssl.handshake stays at 0), so plaintext HTTP - bytes hit uvicorn's TLS listener and the upstream RSTs the - connection. Mirrors what the working `idp` cluster does. 
*/}} + {{- end }} {{- end }} {{- end }} @@ -684,8 +688,15 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.agent.host }} - {{- if $gw.tls.certManager.enabled }} common_tls_context: + {{/* Envoy 1.29 upstream defaults to TLS 1.2 max. uvicorn's + SSLContext uses Python defaults (TLS 1.2 floor, 1.3 if + the openssl version supports it). Allow up to 1.3 so + negotiation can pick the most compatible option. */}} + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + {{- if $gw.tls.certManager.enabled }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -693,12 +704,7 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- end }} - {{/* Default mode: omit common_tls_context entirely. Setting it - to {} causes Envoy to silently degrade the transport socket - to raw TCP (ssl.handshake stays at 0), so plaintext HTTP - bytes hit uvicorn's TLS listener and the upstream RSTs the - connection. Mirrors what the working `idp` cluster does. */}} + {{- end }} {{- end }} {{- end }} @@ -724,8 +730,15 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $gw.upstreams.logger.host }} - {{- if $gw.tls.certManager.enabled }} common_tls_context: + {{/* Envoy 1.29 upstream defaults to TLS 1.2 max. uvicorn's + SSLContext uses Python defaults (TLS 1.2 floor, 1.3 if + the openssl version supports it). Allow up to 1.3 so + negotiation can pick the most compatible option. */}} + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + {{- if $gw.tls.certManager.enabled }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -733,12 +746,7 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- end }} - {{/* Default mode: omit common_tls_context entirely. 
Setting it - to {} causes Envoy to silently degrade the transport socket - to raw TCP (ssl.handshake stays at 0), so plaintext HTTP - bytes hit uvicorn's TLS listener and the upstream RSTs the - connection. Mirrors what the working `idp` cluster does. */}} + {{- end }} {{- end }} {{- end }} @@ -851,8 +859,15 @@ data: typed_config: "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext sni: {{ $jwksHost }} - {{- if $gw.tls.certManager.enabled }} common_tls_context: + {{/* Envoy 1.29 upstream defaults to TLS 1.2 max. uvicorn's + SSLContext uses Python defaults (TLS 1.2 floor, 1.3 if + the openssl version supports it). Allow up to 1.3 so + negotiation can pick the most compatible option. */}} + tls_params: + tls_minimum_protocol_version: TLSv1_2 + tls_maximum_protocol_version: TLSv1_3 + {{- if $gw.tls.certManager.enabled }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -860,12 +875,7 @@ data: path: /var/config/sds_upstream_ca.yaml watched_directory: path: /var/config - {{- end }} - {{/* Default mode: omit common_tls_context entirely. Setting it - to {} causes Envoy to silently degrade the transport socket - to raw TCP (ssl.handshake stays at 0), so plaintext HTTP - bytes hit uvicorn's TLS listener and the upstream RSTs the - connection. Mirrors what the working `idp` cluster does. 
*/}} + {{- end }} {{- end }} {{- end }} From 76a4fb1e5cea8a868e333ce1d8bf5d17a7d06c16 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 15:27:11 -0700 Subject: [PATCH 6/9] Update tests and docs --- .../getting_started/deploy_service.rst | 27 +++- src/utils/ssl_config.py | 36 ++++- src/utils/tests/BUILD | 12 ++ src/utils/tests/test_ssl_config.py | 150 ++++++++++++++++++ 4 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 src/utils/tests/test_ssl_config.py diff --git a/docs/deployment_guide/getting_started/deploy_service.rst b/docs/deployment_guide/getting_started/deploy_service.rst index adb00f45f..15986cbce 100644 --- a/docs/deployment_guide/getting_started/deploy_service.rst +++ b/docs/deployment_guide/getting_started/deploy_service.rst @@ -288,7 +288,7 @@ Create ``osmo_values.yaml`` for the OSMO service with the following sample. :icon: file .. code-block:: yaml - :emphasize-lines: 4, 21-23, 34, 36, 42, 51, 54-59, 74, 148-149, 153-154, 160, 164, 178-180, 195-197 + :emphasize-lines: 4, 21-23, 34, 36, 42, 51, 54-59, 74, 148-149, 153-154, 160, 164, 178-180, 217-219 # Global configuration shared across all OSMO services global: @@ -475,10 +475,32 @@ Create ``osmo_values.yaml`` for the OSMO service with the following sample. # OSMO-issued JWTs (e.g. for access-token-based access) - issuer: osmo audience: osmo - jwks_uri: http://osmo-service/api/auth/keys + # https:// because the gateway -> upstream path is encrypted by + # default (gateway.tls.enabled). Use http:// only if you set + # gateway.tls.enabled: false. + jwks_uri: https://osmo-service/api/auth/keys user_claim: unique_name cluster: osmo-service-jwks + # Gateway -> upstream TLS. Enabled by default: each upstream service + # (osmo-service, osmo-router, osmo-agent, osmo-logger) mints an + # ephemeral self-signed cert in-process at startup, uvicorn serves + # HTTPS on :8000, and Envoy connects with TLS but skips cert validation + # (common_tls_context: {}). 
UI stays HTTP behind NetworkPolicy. + # + # To switch to validated TLS managed by cert-manager, set + # certManager.enabled=true and (optionally) point at an existing + # Issuer/ClusterIssuer via certManager.issuerRef. See the chart README + # for the full set of fields. + tls: + enabled: true + # certManager: + # enabled: true + # issuerRef: + # name: vault-issuer + # kind: ClusterIssuer + # group: cert-manager.io + # OAuth2 Proxy configuration # Set OIDC issuer URL and client ID from your IdP (e.g. Microsoft Entra ID, Google). See identity_provider_setup. oauth2Proxy: @@ -632,6 +654,7 @@ Troubleshooting * **Database connection failures**: Verify the database is running and accessible * **Authentication configuration issues**: Verify the authentication configuration is correct * **Gateway routing problems**: Verify the gateway pods are running and the ``osmo-gateway`` service has an external IP (``kubectl get svc osmo-gateway -n osmo``) + * **Repeated** ``Jwks async fetching ... failed`` **in the gateway logs**: the OSMO-issued-JWT provider's ``jwks_uri`` scheme must match ``gateway.tls.enabled`` (``https://`` when on, ``http://`` when off). Verify with the Envoy admin endpoint: ``cluster.osmo-service-jwks.ssl.handshake`` should grow alongside ``upstream_cx_total``; if it stays at ``0``, the upstream wasn't restarted to pick up its TLS config. 
* **Resource constraints**: Verify the resource limits are set correctly * **Missing secrets or incorrect configurations**: Verify the secrets are created correctly and the configurations are correct * **ConfigMap validation errors**: Pod in CrashLoopBackOff after a Helm upgrade — check ``kubectl describe configmap osmo-service-configs`` for the validation error diff --git a/src/utils/ssl_config.py b/src/utils/ssl_config.py index c9a5eaa03..c0272a656 100644 --- a/src/utils/ssl_config.py +++ b/src/utils/ssl_config.py @@ -73,8 +73,42 @@ class SSLConfig(pydantic.BaseModel): json_schema_extra={'command_line': 'ssl_self_signed', 'env': 'OSMO_SSL_SELF_SIGNED'}) + @pydantic.model_validator(mode='after') + def _validate_ssl_combination(self) -> 'SSLConfig': + """Reject incomplete or conflicting TLS settings at config-load time. + + Silently falling back to HTTP when one of these is misconfigured leads + to confusing failures later (Envoy talks TLS to a plain-HTTP listener, + clients hit unexpected redirects, etc.). Fail fast instead so the + operator sees the problem at startup. + """ + explicit_paths = bool(self.ssl_keyfile) or bool(self.ssl_certfile) + both_paths = bool(self.ssl_keyfile) and bool(self.ssl_certfile) + + # Incomplete: exactly one of keyfile/certfile. + if explicit_paths and not both_paths: + missing = 'ssl_certfile' if self.ssl_keyfile else 'ssl_keyfile' + raise ValueError( + f'TLS misconfigured: ssl_keyfile and ssl_certfile must be set ' + f'together; missing {missing}. Set both to enable TLS, or ' + f'unset both to serve plain HTTP.') + + # Conflicting: self-signed mode plus explicit on-disk paths. + if self.ssl_self_signed and explicit_paths: + raise ValueError( + 'TLS misconfigured: ssl_self_signed cannot be combined with ' + 'explicit ssl_keyfile/ssl_certfile. 
Pick one mode — set ' + 'ssl_self_signed=true to mint an ephemeral cert in-process, ' + 'or provide ssl_keyfile + ssl_certfile to use on-disk PEMs.') + + return self + def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: - """Return uvicorn keyword args for TLS, or an empty dict if TLS is off.""" + """Return uvicorn keyword args for TLS, or an empty dict if TLS is off. + + The validator above guarantees we're in exactly one of three states: + all-unset (HTTP), self-signed-only, or both paths set. + """ if self.ssl_self_signed: keyfile, certfile = _mint_ephemeral_self_signed() return {'ssl_keyfile': keyfile, 'ssl_certfile': certfile} diff --git a/src/utils/tests/BUILD b/src/utils/tests/BUILD index a7e63113d..665d94db3 100644 --- a/src/utils/tests/BUILD +++ b/src/utils/tests/BUILD @@ -30,3 +30,15 @@ osmo_py_test( requirement("jwcrypto"), ] ) + +osmo_py_test( + name = "test_ssl_config", + srcs = [ + "test_ssl_config.py" + ], + deps = [ + "//src/utils:ssl_config", + requirement("cryptography"), + requirement("pydantic"), + ] +) diff --git a/src/utils/tests/test_ssl_config.py b/src/utils/tests/test_ssl_config.py new file mode 100644 index 000000000..e3684fc9f --- /dev/null +++ b/src/utils/tests/test_ssl_config.py @@ -0,0 +1,150 @@ +""" +SPDX-FileCopyrightText: NVIDIA CORPORATION +Copyright (c) 2026 NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +SPDX-License-Identifier: Apache-2.0 +""" +import datetime +import os +import socket +import ssl +import stat +import unittest + +from cryptography import x509 +import pydantic + +from src.utils import ssl_config + + +class TestSSLConfigKwargs(unittest.TestCase): + """uvicorn_ssl_kwargs() picks the right mode for the given fields.""" + + def test_no_tls_returns_empty_kwargs(self): + cfg = ssl_config.SSLConfig() + self.assertEqual(cfg.uvicorn_ssl_kwargs(), {}) + + def test_explicit_paths_pass_through(self): + cfg = ssl_config.SSLConfig(ssl_keyfile='/etc/k.pem', ssl_certfile='/etc/c.pem') + self.assertEqual(cfg.uvicorn_ssl_kwargs(), + {'ssl_keyfile': '/etc/k.pem', 'ssl_certfile': '/etc/c.pem'}) + + def test_only_keyfile_set_raises(self): + # Half-configured TLS is the kind of bug that silently degrades a + # production listener to plain HTTP; the validator must fail loudly. + with self.assertRaisesRegex(pydantic.ValidationError, 'ssl_certfile'): + ssl_config.SSLConfig(ssl_keyfile='/etc/k.pem') + + def test_only_certfile_set_raises(self): + with self.assertRaisesRegex(pydantic.ValidationError, 'ssl_keyfile'): + ssl_config.SSLConfig(ssl_certfile='/etc/c.pem') + + def test_self_signed_with_explicit_paths_raises(self): + # Specifying both modes is ambiguous — pick one. Reject early so the + # operator notices instead of guessing which mode wins. + with self.assertRaisesRegex(pydantic.ValidationError, 'ssl_self_signed'): + ssl_config.SSLConfig(ssl_self_signed=True, + ssl_keyfile='/etc/k.pem', + ssl_certfile='/etc/c.pem') + + def test_self_signed_with_just_keyfile_raises(self): + # Conflict-detection should fire even when the on-disk pair is itself + # incomplete; otherwise the user gets two confusing errors instead of + # one pointing at the conflict. 
+ with self.assertRaises(pydantic.ValidationError): + ssl_config.SSLConfig(ssl_self_signed=True, ssl_keyfile='/etc/k.pem') + + def test_self_signed_returns_real_paths(self): + cfg = ssl_config.SSLConfig(ssl_self_signed=True) + kwargs = cfg.uvicorn_ssl_kwargs() + self.assertIn('ssl_keyfile', kwargs) + self.assertIn('ssl_certfile', kwargs) + self.assertTrue(os.path.isfile(kwargs['ssl_keyfile'])) + self.assertTrue(os.path.isfile(kwargs['ssl_certfile'])) + + +class TestEphemeralSelfSigned(unittest.TestCase): + """_mint_ephemeral_self_signed produces a usable cert/key pair on disk.""" + + def setUp(self): + self.keyfile, self.certfile = ssl_config._mint_ephemeral_self_signed() + + def test_files_exist_and_are_nonempty(self): + self.assertTrue(os.path.isfile(self.keyfile)) + self.assertTrue(os.path.isfile(self.certfile)) + self.assertGreater(os.path.getsize(self.keyfile), 0) + self.assertGreater(os.path.getsize(self.certfile), 0) + + def test_cert_pem_parses_as_x509(self): + with open(self.certfile, 'rb') as f: + cert = x509.load_pem_x509_certificate(f.read()) + self.assertIsInstance(cert, x509.Certificate) + + def test_cert_has_expected_sans(self): + with open(self.certfile, 'rb') as f: + cert = x509.load_pem_x509_certificate(f.read()) + san_ext = cert.extensions.get_extension_for_class(x509.SubjectAlternativeName) + dns_names = san_ext.value.get_values_for_type(x509.DNSName) + ip_addrs = [str(ip) for ip in san_ext.value.get_values_for_type(x509.IPAddress)] + # Hostname (CN) and the standard loopback aliases must be present so + # local probes (cluster-internal HTTPS, sidecar self-checks) get a + # name match. Envoy with common_tls_context: {} ignores SANs anyway. 
+ self.assertIn(socket.gethostname() or 'localhost', dns_names) + self.assertIn('localhost', dns_names) + self.assertIn('127.0.0.1', ip_addrs) + + def test_cert_is_not_a_ca(self): + with open(self.certfile, 'rb') as f: + cert = x509.load_pem_x509_certificate(f.read()) + bc = cert.extensions.get_extension_for_class(x509.BasicConstraints) + self.assertFalse(bc.value.ca) + + def test_cert_validity_window_includes_now(self): + with open(self.certfile, 'rb') as f: + cert = x509.load_pem_x509_certificate(f.read()) + now = datetime.datetime.now(datetime.timezone.utc) + self.assertLess(cert.not_valid_before_utc, now) + self.assertGreater(cert.not_valid_after_utc, now) + + def test_keyfile_is_not_world_readable(self): + # Private key must be 0600 (only owner can read). Any group/other + # access on a private key would be a regression. + mode = stat.S_IMODE(os.stat(self.keyfile).st_mode) + self.assertEqual(mode & 0o077, 0, + f'keyfile permissions {oct(mode)} grant access beyond owner') + + def test_cert_loads_into_uvicorn_style_ssl_context(self): + # This is the exact call uvicorn makes internally; if it fails here + # uvicorn would fail at startup. Doubles as an end-to-end sanity check. + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ctx.load_cert_chain(certfile=self.certfile, keyfile=self.keyfile) + + def test_each_call_produces_a_unique_cert(self): + # Ephemerality matters — every process start should mint a fresh cert + # rather than reusing one across pods. 
+ keyfile2, certfile2 = ssl_config._mint_ephemeral_self_signed() + self.assertNotEqual(self.keyfile, keyfile2) + self.assertNotEqual(self.certfile, certfile2) + with open(self.certfile, 'rb') as f: + cert1 = x509.load_pem_x509_certificate(f.read()) + with open(certfile2, 'rb') as f: + cert2 = x509.load_pem_x509_certificate(f.read()) + self.assertNotEqual(cert1.serial_number, cert2.serial_number) + self.assertNotEqual(cert1.public_key().public_numbers(), + cert2.public_key().public_numbers()) + + +if __name__ == '__main__': + unittest.main() From 8811f7ca32052c8e7b93097ed2a3f20b77873689 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 5 May 2026 16:06:07 -0700 Subject: [PATCH 7/9] remove cert manager --- deployments/charts/service/README.md | 19 +-- .../templates/_gateway-envoy-config.tpl | 12 +- .../service/templates/_gateway-helpers.tpl | 30 ++--- .../service/templates/agent-service.yaml | 8 +- .../charts/service/templates/api-service.yaml | 8 +- .../charts/service/templates/gateway-tls.yaml | 126 ------------------ .../charts/service/templates/gateway.yaml | 6 +- .../service/templates/logger-service.yaml | 8 +- .../service/templates/router-service.yaml | 8 +- deployments/charts/service/values.yaml | 41 +++--- .../getting_started/deploy_service.rst | 24 ++-- 11 files changed, 81 insertions(+), 209 deletions(-) delete mode 100644 deployments/charts/service/templates/gateway-tls.yaml diff --git a/deployments/charts/service/README.md b/deployments/charts/service/README.md index 1e3898f69..dc5495b43 100644 --- a/deployments/charts/service/README.md +++ b/deployments/charts/service/README.md @@ -407,23 +407,18 @@ Envoy uses filesystem-based dynamic configuration (LDS/CDS). When the ConfigMap Traffic between the Envoy gateway and the upstream services (`osmo-service`, `osmo-router`, `osmo-agent`, `osmo-logger`) is encrypted by default. The UI intentionally stays on plain HTTP behind NetworkPolicy — Next.js does not natively serve TLS. 
-Two modes: +**Default — encryption without validation.** Each upstream service mints its own ephemeral self-signed cert in-process at startup (ECDSA P-256, ~1ms) and loads it into uvicorn's SSLContext via `--ssl_self_signed true`. Envoy connects with TLS but does *not* validate the cert. The wire is encrypted; identity verification is delegated to NetworkPolicy + Kubernetes RBAC. No CA management, no Secrets, no rotation — cert lifecycle is tied to process lifecycle. -**Default — encryption without validation.** Each upstream service mints its own ephemeral self-signed cert in-process at startup (ECDSA P-256, ~1ms), writes it to a temp dir, and loads it into uvicorn's SSLContext. The Python service does this via `--ssl_self_signed true` from the chart; the cert generation happens in `SSLConfig._mint_ephemeral_self_signed()` (`src/utils/static_config.py`). Envoy connects with TLS but configures `common_tls_context: {}` on the upstream cluster — it does *not* validate the cert. The wire is encrypted; identity verification is delegated to NetworkPolicy + Kubernetes RBAC. - -This means: no CA management, no Secrets to rotate, no ArgoCD churn, no init containers, no cross-pod cert dependency. Cert lifecycle is tied to process lifecycle — a pod restart mints a fresh cert. - -**Validated — cert-manager.** Set `gateway.tls.certManager.enabled: true`. The chart emits cert-manager `Issuer` + `Certificate` resources. By default it creates a self-signed root + a CA Issuer + per-service Certificates; the upstream Deployments mount the resulting Secrets read-only. To plug in an existing CA (Vault, internal PKI, ACME), set `gateway.tls.certManager.issuerRef`. Requires cert-manager installed in the cluster. +**Externally-provisioned certs.** Point `gateway.tls.upstreamCerts.<name>` at an existing `kubernetes.io/tls` Secret containing `tls.crt` + `tls.key`. That Secret is mounted at `/etc/osmo/tls` and uvicorn loads it instead of self-signing.
To make Envoy validate against a CA, set `gateway.tls.caSecret` to a Secret containing `ca.crt`. The chart does not create these Secrets — provision them however suits your environment (cert-manager, Vault CSI, sealed-secrets, manual `kubectl create secret tls`, etc.). The two knobs are independent: you can use external certs without validation, or validation alone (rarely useful), but typical "real" TLS sets both. | Parameter | Description | Default | |-----------|-------------|---------| | `gateway.tls.enabled` | Encrypt gateway → upstream traffic. | `true` | -| `gateway.tls.caDuration` | CA cert validity (cert-manager mode). | `87600h` (10y) | -| `gateway.tls.caRenewBefore` | Renew CA this long before expiry (cert-manager mode). | `720h` (30d) | -| `gateway.tls.certDuration` | Leaf cert validity (cert-manager mode). | `43800h` (5y) | -| `gateway.tls.certRenewBefore` | Renew leaf this long before expiry (cert-manager mode). | `360h` (15d) | -| `gateway.tls.certManager.enabled` | Switch from default mode to cert-manager-managed validated TLS. | `false` | -| `gateway.tls.certManager.issuerRef` | Optional: point at an existing Issuer/ClusterIssuer. Map with `name`, `kind` (`Issuer` or `ClusterIssuer`), and `group` (defaults to `cert-manager.io`). When empty, the chart creates a self-signed Issuer + CA chain. | `{}` | +| `gateway.tls.upstreamCerts.service` | Existing `kubernetes.io/tls` Secret for `osmo-service`. Empty string ⇒ self-signed. | `""` | +| `gateway.tls.upstreamCerts.router` | Same, for `osmo-router`. | `""` | +| `gateway.tls.upstreamCerts.agent` | Same, for `osmo-agent`. | `""` | +| `gateway.tls.upstreamCerts.logger` | Same, for `osmo-logger`. | `""` | +| `gateway.tls.caSecret` | Existing Secret containing `ca.crt`. When set, Envoy validates upstreams against this CA; when empty, TLS is encryption-only. | `""` | NetworkPolicy and TLS are independent: NetworkPolicy controls *who* can connect at L3/L4; TLS encrypts the bytes at L7. 
Run them together for defense in depth. diff --git a/deployments/charts/service/templates/_gateway-envoy-config.tpl b/deployments/charts/service/templates/_gateway-envoy-config.tpl index e7a708ba7..04449449f 100644 --- a/deployments/charts/service/templates/_gateway-envoy-config.tpl +++ b/deployments/charts/service/templates/_gateway-envoy-config.tpl @@ -70,7 +70,7 @@ data: filename: /etc/ssl/envoy-certs/tls.key {{- end }} - {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} + {{- if and $gw.tls.enabled $gw.tls.caSecret }} sds_upstream_ca.yaml: | resources: - "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.Secret @@ -587,7 +587,7 @@ data: tls_params: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - {{- if $gw.tls.certManager.enabled }} + {{- if $gw.tls.caSecret }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -630,7 +630,7 @@ data: tls_params: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - {{- if $gw.tls.certManager.enabled }} + {{- if $gw.tls.caSecret }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -696,7 +696,7 @@ data: tls_params: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - {{- if $gw.tls.certManager.enabled }} + {{- if $gw.tls.caSecret }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -738,7 +738,7 @@ data: tls_params: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - {{- if $gw.tls.certManager.enabled }} + {{- if $gw.tls.caSecret }} validation_context_sds_secret_config: name: upstream_ca sds_config: @@ -867,7 +867,7 @@ data: tls_params: tls_minimum_protocol_version: TLSv1_2 tls_maximum_protocol_version: TLSv1_3 - {{- if $gw.tls.certManager.enabled }} + {{- if $gw.tls.caSecret }} validation_context_sds_secret_config: name: upstream_ca sds_config: diff --git a/deployments/charts/service/templates/_gateway-helpers.tpl 
b/deployments/charts/service/templates/_gateway-helpers.tpl index 9e424c0ae..49b255498 100644 --- a/deployments/charts/service/templates/_gateway-helpers.tpl +++ b/deployments/charts/service/templates/_gateway-helpers.tpl @@ -32,18 +32,16 @@ app.kubernetes.io/component: {{ .component }} {{- end }} {{/* -Per-upstream TLS args. Pass a dict with "context". +Per-upstream TLS args. Pass a dict with "context" and "secretName". -Default mode (no cert-manager): emits --ssl_self_signed true. The Python -service mints a fresh self-signed cert at process start and points uvicorn -at it. No init container, no volume, no Secret on the chart side. - -cert-manager mode: emits --ssl_keyfile / --ssl_certfile pointing at the -PEMs mounted from the cert-manager-managed Secret at /etc/osmo/tls. +When secretName is non-empty, that Secret is mounted at /etc/osmo/tls and +uvicorn loads tls.crt + tls.key from there (--ssl_keyfile / --ssl_certfile). +When empty, the Python service mints an ephemeral self-signed cert in +process at startup (--ssl_self_signed true) — no chart-side cert material. */}} {{- define "osmo.upstream-tls-args" -}} {{- if .context.Values.gateway.tls.enabled }} -{{- if .context.Values.gateway.tls.certManager.enabled }} +{{- if .secretName }} - --ssl_keyfile - /etc/osmo/tls/tls.key - --ssl_certfile @@ -56,11 +54,12 @@ PEMs mounted from the cert-manager-managed Secret at /etc/osmo/tls. {{- end }} {{/* -TLS volume mount for an upstream container. Only emitted in cert-manager -mode — default mode keeps cert material in an in-process tempdir. +TLS volume mount for an upstream container. Only emitted when a Secret +name is provided — self-signed mode keeps cert material in an in-process +tempdir, so no mount is needed. 
*/}} {{- define "osmo.upstream-tls-volume-mount" -}} -{{- if and .context.Values.gateway.tls.enabled .context.Values.gateway.tls.certManager.enabled }} +{{- if and .context.Values.gateway.tls.enabled .secretName }} - name: tls mountPath: /etc/osmo/tls readOnly: true @@ -68,15 +67,14 @@ mode — default mode keeps cert material in an in-process tempdir. {{- end }} {{/* -TLS volume for an upstream pod. Pass dict with "context" and "secret" (the -per-service Secret name in cert-manager mode, e.g. "osmo-service-tls"). -Only emitted in cert-manager mode. +TLS volume for an upstream pod. Pass dict with "context" and "secretName". +Only emitted when secretName is non-empty. */}} {{- define "osmo.upstream-tls-volume" -}} -{{- if and .context.Values.gateway.tls.enabled .context.Values.gateway.tls.certManager.enabled }} +{{- if and .context.Values.gateway.tls.enabled .secretName }} - name: tls secret: - secretName: {{ .secret }} + secretName: {{ .secretName }} {{- end }} {{- end }} diff --git a/deployments/charts/service/templates/agent-service.yaml b/deployments/charts/service/templates/agent-service.yaml index 53c03a366..7e54f9424 100644 --- a/deployments/charts/service/templates/agent-service.yaml +++ b/deployments/charts/service/templates/agent-service.yaml @@ -130,7 +130,7 @@ spec: {{- range $arg := .Values.services.agent.extraArgs }} - {{ $arg | quote }} {{- end }} - {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-args" (dict "context" . 
"secretName" .Values.gateway.tls.upstreamCerts.agent) | nindent 8 }} env: {{- if .Values.services.migration.enabled }} - name: OSMO_SCHEMA_VERSION @@ -155,7 +155,7 @@ spec: {{- end }} imagePullPolicy: {{ .Values.services.agent.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.agent.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.upstreamCerts.agent) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -169,7 +169,7 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.agent | nindent 8 }} - {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.agent) | nindent 8 }} resources: {{- toYaml .Values.services.agent.resources | nindent 10 }} @@ -215,7 +215,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.agent | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.agent | nindent 8 }} - {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-agent-tls") | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . 
"secretName" .Values.gateway.tls.upstreamCerts.agent) | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/api-service.yaml b/deployments/charts/service/templates/api-service.yaml index 20ae848fd..4e93c9e98 100644 --- a/deployments/charts/service/templates/api-service.yaml +++ b/deployments/charts/service/templates/api-service.yaml @@ -152,7 +152,7 @@ spec: {{- range $arg := .Values.services.service.extraArgs }} - {{ $arg | quote }} {{- end }} - {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-args" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.service) | nindent 8 }} env: - name: OSMO_DISABLE_TASK_METRICS value: {{ .Values.services.service.disableTaskMetrics | quote }} @@ -194,7 +194,7 @@ spec: ports: - name: metrics containerPort: 9464 - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.service.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.upstreamCerts.service) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -208,7 +208,7 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.service | nindent 8 }} - {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" . 
"secretName" .Values.gateway.tls.upstreamCerts.service) | nindent 8 }} resources: {{- toYaml .Values.services.service.resources | nindent 10 }} @@ -248,7 +248,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.service | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.service | nindent 8 }} - {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-service-tls") | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.service) | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/gateway-tls.yaml b/deployments/charts/service/templates/gateway-tls.yaml deleted file mode 100644 index a2cdf2b6b..000000000 --- a/deployments/charts/service/templates/gateway-tls.yaml +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -{{/* -Gateway -> upstream TLS. - -Default mode (gateway.tls.enabled=true, certManager.enabled=false): - Each upstream pod runs an init container that generates an ephemeral - self-signed cert into an emptyDir. uvicorn serves HTTPS with that cert. - Envoy connects with TLS but does NOT validate (common_tls_context: {} on - the upstream cluster). 
The wire is encrypted; identity verification is - handled by NetworkPolicy + Kubernetes RBAC, not by cert pinning. - - This template emits nothing in default mode. The init containers and - emptyDir volumes are wired into each Deployment via helpers in - _gateway-helpers.tpl, and the Envoy CDS uses an empty - common_tls_context. Result: no Secrets, no Jobs, no CA, no rotation — - cert lifecycle is tied to pod lifecycle. - -cert-manager mode (gateway.tls.enabled=true, certManager.enabled=true): - cert-manager manages a CA and per-service leaf certs as proper Secrets. - Envoy validates upstreams against that CA. Use this when you need - validated/audited TLS or want to plug in an enterprise PKI. -*/}} - -{{- if .Values.gateway.tls.enabled }} -{{- if .Values.gateway.tls.certManager.enabled }} -{{- $gw := .Values.gateway }} -{{- $gwName := include "osmo.gateway-name" . }} -{{- $ns := .Release.Namespace }} - -{{- $upstreams := list - (dict "name" "osmo-service" "secret" "osmo-service-tls" - "dnsNames" (list "osmo-service" (printf "osmo-service.%s" $ns) (printf "osmo-service.%s.svc" $ns) (printf "osmo-service.%s.svc.cluster.local" $ns))) - (dict "name" "osmo-router" "secret" "osmo-router-tls" - "dnsNames" (list "osmo-router" "osmo-router-headless" - (printf "osmo-router.%s" $ns) (printf "osmo-router.%s.svc" $ns) (printf "osmo-router.%s.svc.cluster.local" $ns) - (printf "osmo-router-headless.%s" $ns) (printf "osmo-router-headless.%s.svc" $ns) (printf "osmo-router-headless.%s.svc.cluster.local" $ns))) - (dict "name" "osmo-agent" "secret" "osmo-agent-tls" - "dnsNames" (list "osmo-agent" (printf "osmo-agent.%s" $ns) (printf "osmo-agent.%s.svc" $ns) (printf "osmo-agent.%s.svc.cluster.local" $ns))) - (dict "name" "osmo-logger" "secret" "osmo-logger-tls" - "dnsNames" (list "osmo-logger" (printf "osmo-logger.%s" $ns) (printf "osmo-logger.%s.svc" $ns) (printf "osmo-logger.%s.svc.cluster.local" $ns))) -}} - -{{- if not $gw.tls.certManager.issuerRef }} -{{/* No external issuer — 
create a self-signed Issuer + a CA Certificate - and a CA Issuer that signs the per-service certs. */}} -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: {{ $gwName }}-selfsigned -spec: - selfSigned: {} - ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: {{ $gwName }}-ca -spec: - isCA: true - commonName: {{ $gwName }}-ca - secretName: {{ $gwName }}-ca-tls - duration: {{ $gw.tls.caDuration | default "87600h" }} - renewBefore: {{ $gw.tls.caRenewBefore | default "720h" }} - privateKey: - algorithm: ECDSA - size: 256 - issuerRef: - name: {{ $gwName }}-selfsigned - kind: Issuer - group: cert-manager.io - ---- -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: {{ $gwName }}-ca -spec: - ca: - secretName: {{ $gwName }}-ca-tls -{{- end }} - -{{- $issuerName := $gw.tls.certManager.issuerRef.name | default (printf "%s-ca" $gwName) }} -{{- $issuerKind := $gw.tls.certManager.issuerRef.kind | default "Issuer" }} -{{- $issuerGroup := $gw.tls.certManager.issuerRef.group | default "cert-manager.io" }} - -{{- range $u := $upstreams }} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: {{ $u.name }} -spec: - secretName: {{ $u.secret }} - duration: {{ $gw.tls.certDuration | default "43800h" }} - renewBefore: {{ $gw.tls.certRenewBefore | default "360h" }} - commonName: {{ $u.name }} - dnsNames: - {{- range $u.dnsNames }} - - {{ . 
}} - {{- end }} - privateKey: - algorithm: ECDSA - size: 256 - issuerRef: - name: {{ $issuerName }} - kind: {{ $issuerKind }} - group: {{ $issuerGroup }} -{{- end }} - -{{- end }}{{/* end certManager.enabled */}} -{{- end }}{{/* end gateway.tls.enabled */}} diff --git a/deployments/charts/service/templates/gateway.yaml b/deployments/charts/service/templates/gateway.yaml index 1951c4347..df1d65690 100644 --- a/deployments/charts/service/templates/gateway.yaml +++ b/deployments/charts/service/templates/gateway.yaml @@ -84,7 +84,7 @@ spec: - mountPath: /var/config name: envoy-config readOnly: true - {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} + {{- if and $gw.tls.enabled $gw.tls.caSecret }} - name: gateway-tls-ca mountPath: /etc/gateway-tls readOnly: true @@ -112,10 +112,10 @@ spec: - name: envoy-config configMap: name: {{ $gwName }}-envoy-config - {{- if and $gw.tls.enabled $gw.tls.certManager.enabled }} + {{- if and $gw.tls.enabled $gw.tls.caSecret }} - name: gateway-tls-ca secret: - secretName: {{ $gwName }}-ca-tls + secretName: {{ $gw.tls.caSecret }} items: - key: ca.crt path: ca.crt diff --git a/deployments/charts/service/templates/logger-service.yaml b/deployments/charts/service/templates/logger-service.yaml index da89eac1a..913ec03d5 100644 --- a/deployments/charts/service/templates/logger-service.yaml +++ b/deployments/charts/service/templates/logger-service.yaml @@ -125,7 +125,7 @@ spec: {{- range $arg := .Values.services.logger.extraArgs }} - {{ $arg | quote }} {{- end }} - {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-args" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.logger) | nindent 8 }} env: {{- include "osmo.configmap-env" . 
| nindent 8 }} {{- include "osmo.extra-env" .Values.services.logger | nindent 8 }} @@ -146,7 +146,7 @@ spec: {{- end }} imagePullPolicy: {{ .Values.services.logger.imagePullPolicy }} ports: - {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} + {{- if or .Values.services.configFile.enabled .Values.global.logs.enabled .Values.services.configs.enabled .Values.services.logger.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.upstreamCerts.logger) }} volumeMounts: {{- end }} {{- if .Values.services.configFile.enabled}} @@ -160,7 +160,7 @@ spec: mountPath: /logs {{- end }} {{- include "osmo.extra-volume-mounts" .Values.services.logger | nindent 8 }} - {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.logger) | nindent 8 }} resources: {{- toYaml .Values.services.logger.resources | nindent 10 }} @@ -206,7 +206,7 @@ spec: {{- include "osmo.extra-sidecars" .Values.services.logger | nindent 6 }} volumes: {{- include "osmo.extra-volumes" .Values.services.logger | nindent 8 }} - {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-logger-tls") | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.logger) | nindent 8 }} {{- if .Values.global.logs.enabled }} - name: logs emptyDir: {} diff --git a/deployments/charts/service/templates/router-service.yaml b/deployments/charts/service/templates/router-service.yaml index d37a94fd6..920ed4ebd 100644 --- a/deployments/charts/service/templates/router-service.yaml +++ b/deployments/charts/service/templates/router-service.yaml @@ -120,7 +120,7 @@ spec: - {{ . 
}} {{- end }} {{- end }} - {{- include "osmo.upstream-tls-args" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-args" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.router) | nindent 8 }} env: - name: OSMO_SCHEMA_VERSION value: {{ .Values.targetSchema | default "public" }} @@ -163,7 +163,7 @@ spec: protocol: {{ .protocol | default "TCP" }} {{- end }} {{- end }} - {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.certManager.enabled) }} + {{- if or .Values.global.logs.enabled .Values.services.configFile.enabled .Values.services.router.extraVolumeMounts (and .Values.gateway.tls.enabled .Values.gateway.tls.upstreamCerts.router) }} volumeMounts: {{- if .Values.services.configFile.enabled}} - mountPath: {{ .Values.services.configFile.path }} @@ -177,7 +177,7 @@ spec: {{- with .Values.services.router.extraVolumeMounts }} {{- toYaml . | nindent 8 }} {{- end }} - {{- include "osmo.upstream-tls-volume-mount" (dict "context" .) | nindent 8 }} + {{- include "osmo.upstream-tls-volume-mount" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.router) | nindent 8 }} {{- end }} resources: {{- toYaml .Values.services.router.resources | nindent 10 }} @@ -217,7 +217,7 @@ spec: {{- with .Values.services.router.extraVolumes }} {{- toYaml . | nindent 8 }} {{- end }} - {{- include "osmo.upstream-tls-volume" (dict "context" . "secret" "osmo-router-tls") | nindent 8 }} + {{- include "osmo.upstream-tls-volume" (dict "context" . "secretName" .Values.gateway.tls.upstreamCerts.router) | nindent 8 }} --- diff --git a/deployments/charts/service/values.yaml b/deployments/charts/service/values.yaml index a35fb6c68..98de20410 100644 --- a/deployments/charts/service/values.yaml +++ b/deployments/charts/service/values.yaml @@ -2105,24 +2105,29 @@ gateway: port: 8000 ## TLS for gateway -> upstream traffic. 
Default: each service mints an - ## ephemeral self-signed cert in-process; Envoy uses common_tls_context: {} - ## (encrypt, no validation). Set certManager.enabled to use cert-manager - ## for validated TLS instead. UI is excluded — stays HTTP. + ## ephemeral self-signed cert in-process; Envoy connects with TLS but does + ## not validate (encryption-only — identity is provided by NetworkPolicy / + ## RBAC). UI is excluded; stays HTTP. + ## + ## To use externally-provisioned certs, set upstreamCerts.<upstream> to the + ## name of an existing kubernetes.io/tls Secret containing tls.crt + tls.key. + ## To make Envoy validate against a CA, set caSecret to an existing Secret + ## containing ca.crt. How those Secrets get created (cert-manager, Vault + ## CSI, sealed-secrets, manual, etc.) is up to the operator — OSMO just + ## consumes them. tls: enabled: true - ## cert-manager mode validity periods. - caDuration: 87600h # 10y - caRenewBefore: 720h # 30d - certDuration: 43800h # 5y - certRenewBefore: 360h # 15d - - certManager: - ## Switch from in-process self-signed to cert-manager-managed CA + leafs. - ## Requires cert-manager installed in the cluster. - enabled: false - - ## Optional: existing Issuer/ClusterIssuer to use instead of the - ## chart's self-signed CA. Example: - ## issuerRef: {name: vault-issuer, kind: ClusterIssuer, group: cert-manager.io} - issuerRef: {} + ## Per-upstream server certs. Empty string = mint ephemeral self-signed + ## in-process. Otherwise the named Secret is mounted at /etc/osmo/tls + ## and uvicorn loads tls.crt + tls.key from it. + upstreamCerts: + service: "" + router: "" + agent: "" + logger: "" + + ## Optional CA bundle for Envoy upstream validation. When set, Envoy + ## validates upstream certs against ca.crt from this Secret. When empty, + ## TLS is encryption-only (no validation).
+ caSecret: "" diff --git a/docs/deployment_guide/getting_started/deploy_service.rst b/docs/deployment_guide/getting_started/deploy_service.rst index 15986cbce..dee6f7d4a 100644 --- a/docs/deployment_guide/getting_started/deploy_service.rst +++ b/docs/deployment_guide/getting_started/deploy_service.rst @@ -485,21 +485,21 @@ Create ``osmo_values.yaml`` for the OSMO service with the following sample. # Gateway -> upstream TLS. Enabled by default: each upstream service # (osmo-service, osmo-router, osmo-agent, osmo-logger) mints an # ephemeral self-signed cert in-process at startup, uvicorn serves - # HTTPS on :8000, and Envoy connects with TLS but skips cert validation - # (common_tls_context: {}). UI stays HTTP behind NetworkPolicy. + # HTTPS on :8000, and Envoy connects with TLS but skips cert validation. + # UI stays HTTP behind NetworkPolicy. # - # To switch to validated TLS managed by cert-manager, set - # certManager.enabled=true and (optionally) point at an existing - # Issuer/ClusterIssuer via certManager.issuerRef. See the chart README - # for the full set of fields. + # To use externally-provisioned certs (cert-manager, Vault CSI, + # sealed-secrets, manual — OSMO doesn't care), point upstreamCerts at + # existing kubernetes.io/tls Secrets. To make Envoy validate against a + # CA, set caSecret to an existing Secret containing ca.crt. tls: enabled: true - # certManager: - # enabled: true - # issuerRef: - # name: vault-issuer - # kind: ClusterIssuer - # group: cert-manager.io + # upstreamCerts: + # service: osmo-service-tls + # router: osmo-router-tls + # agent: osmo-agent-tls + # logger: osmo-logger-tls + # caSecret: osmo-gateway-ca # OAuth2 Proxy configuration # Set OIDC issuer URL and client ID from your IdP (e.g. Microsoft Entra ID, Google). See identity_provider_setup. 
From ed017c6c23c089caa3d4b829568e5944aa5f8531 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Wed, 6 May 2026 09:36:26 -0700 Subject: [PATCH 8/9] fix --- src/utils/ssl_config.py | 4 ++-- src/utils/tests/test_ssl_config.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/utils/ssl_config.py b/src/utils/ssl_config.py index c0272a656..1c8260c05 100644 --- a/src/utils/ssl_config.py +++ b/src/utils/ssl_config.py @@ -110,14 +110,14 @@ def uvicorn_ssl_kwargs(self) -> Dict[str, Any]: all-unset (HTTP), self-signed-only, or both paths set. """ if self.ssl_self_signed: - keyfile, certfile = _mint_ephemeral_self_signed() + keyfile, certfile = mint_ephemeral_self_signed() return {'ssl_keyfile': keyfile, 'ssl_certfile': certfile} if self.ssl_keyfile and self.ssl_certfile: return {'ssl_keyfile': self.ssl_keyfile, 'ssl_certfile': self.ssl_certfile} return {} -def _mint_ephemeral_self_signed() -> Tuple[str, str]: +def mint_ephemeral_self_signed() -> Tuple[str, str]: """Generate an ECDSA P-256 self-signed cert and write it to a temp dir. Returns (keyfile_path, certfile_path). 
uvicorn opens both at startup and diff --git a/src/utils/tests/test_ssl_config.py b/src/utils/tests/test_ssl_config.py index e3684fc9f..36f1dba1c 100644 --- a/src/utils/tests/test_ssl_config.py +++ b/src/utils/tests/test_ssl_config.py @@ -24,6 +24,7 @@ import unittest from cryptography import x509 +from cryptography.hazmat.primitives import serialization import pydantic from src.utils import ssl_config @@ -76,10 +77,10 @@ def test_self_signed_returns_real_paths(self): class TestEphemeralSelfSigned(unittest.TestCase): - """_mint_ephemeral_self_signed produces a usable cert/key pair on disk.""" + """mint_ephemeral_self_signed produces a usable cert/key pair on disk.""" def setUp(self): - self.keyfile, self.certfile = ssl_config._mint_ephemeral_self_signed() + self.keyfile, self.certfile = ssl_config.mint_ephemeral_self_signed() def test_files_exist_and_are_nonempty(self): self.assertTrue(os.path.isfile(self.keyfile)) @@ -134,7 +135,7 @@ def test_cert_loads_into_uvicorn_style_ssl_context(self): def test_each_call_produces_a_unique_cert(self): # Ephemerality matters — every process start should mint a fresh cert # rather than reusing one across pods. - keyfile2, certfile2 = ssl_config._mint_ephemeral_self_signed() + keyfile2, certfile2 = ssl_config.mint_ephemeral_self_signed() self.assertNotEqual(self.keyfile, keyfile2) self.assertNotEqual(self.certfile, certfile2) with open(self.certfile, 'rb') as f: @@ -142,8 +143,14 @@ def test_each_call_produces_a_unique_cert(self): with open(certfile2, 'rb') as f: cert2 = x509.load_pem_x509_certificate(f.read()) self.assertNotEqual(cert1.serial_number, cert2.serial_number) - self.assertNotEqual(cert1.public_key().public_numbers(), - cert2.public_key().public_numbers()) + # Compare keys via their DER encoding — public_bytes() is on the base + # public-key protocol so this works regardless of which key type + # mint_ephemeral_self_signed picks (avoids mypy union-attr errors + # over the seven-way PublicKeyTypes union).
+ pub_format = serialization.PublicFormat.SubjectPublicKeyInfo + self.assertNotEqual( + cert1.public_key().public_bytes(serialization.Encoding.DER, pub_format), + cert2.public_key().public_bytes(serialization.Encoding.DER, pub_format)) if __name__ == '__main__': From 3b90a61a3cc4b651af3a762ca84cacc68db3dae5 Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Wed, 6 May 2026 09:54:38 -0700 Subject: [PATCH 9/9] fix --- src/service/core/tests/test_asyncio_startup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/service/core/tests/test_asyncio_startup.py b/src/service/core/tests/test_asyncio_startup.py index abfbbb013..3c798172b 100644 --- a/src/service/core/tests/test_asyncio_startup.py +++ b/src/service/core/tests/test_asyncio_startup.py @@ -84,6 +84,7 @@ def test_logger_main_starts_without_default_event_loop(self): progress_file='/tmp/logger-progress', progress_period=60, config_file=None, + uvicorn_ssl_kwargs=lambda: {}, ) with ( @@ -106,6 +107,7 @@ def test_agent_main_starts_without_default_event_loop(self): host='http://127.0.0.1:8000', progress_file='/tmp/agent-progress', config_file=None, + uvicorn_ssl_kwargs=lambda: {}, ) agent_config = types.SimpleNamespace(progress_period=60) @@ -156,7 +158,10 @@ def test_router_main_starts_without_default_event_loop(self): mock.patch('fastapi.applications.FastAPI.add_middleware'), ): router = importlib.import_module('src.service.router.router') - config = types.SimpleNamespace(host='http://127.0.0.1:8000') + config = types.SimpleNamespace( + host='http://127.0.0.1:8000', + uvicorn_ssl_kwargs=lambda: {}, + ) with ( mock.patch.object(router.RouterServiceConfig, 'load', return_value=config),