From e3a0491e55c7a52c9272d0abccef039ed7951468 Mon Sep 17 00:00:00 2001 From: Varsha Prasad Narsing Date: Thu, 11 Jun 2026 21:17:45 -0700 Subject: [PATCH 1/2] feat: default mTLS to permissive and enable card discovery Change mTLS default from disabled to permissive so agents get mTLS automatically when SPIRE is available. Enable card discovery and verified fetch by default. Add deprecation warnings for legacy JWS signing flags. Changes: - Add +kubebuilder:default=permissive on MTLSMode field - Add ConditionTypeMTLSReady and AnnotationMTLSMode constants - Set kagenti.io/mtls-mode annotation on pod template in controller - Set MTLS_MODE env var on authbridge containers in webhook - Treat empty MTLSMode as permissive across all resolution paths (controller, pod mutator, envoy template, resolved config) - Flip --enable-card-discovery and --enable-verified-fetch to true - Add startup deprecation warnings for legacy signing flags - Clean up mtls-mode annotation on AgentRuntime deletion - Update tests for new default behavior Jira: RHAIENG-4944 Signed-off-by: Varsha Prasad Narsing Assisted-By: Claude (Anthropic AI) --- .../crds/agent.kagenti.dev_agentruntimes.yaml | 7 ++-- .../api/v1alpha1/agentruntime_types.go | 7 ++-- kagenti-operator/cmd/main.go | 32 ++++++++++++++++--- .../agent.kagenti.dev_agentruntimes.yaml | 7 ++-- .../controller/agentruntime_controller.go | 27 ++++++++++++---- .../webhook/injector/agentruntime_config.go | 2 +- .../webhook/injector/envoy_template.go | 17 +++++----- .../webhook/injector/envoy_template_test.go | 7 ++-- .../internal/webhook/injector/pod_mutator.go | 23 +++++++++++-- .../webhook/injector/pod_mutator_test.go | 6 +++- .../webhook/injector/resolved_config.go | 2 +- 11 files changed, 100 insertions(+), 37 deletions(-) diff --git a/charts/kagenti-operator/crds/agent.kagenti.dev_agentruntimes.yaml b/charts/kagenti-operator/crds/agent.kagenti.dev_agentruntimes.yaml index 05504e25..60f96d49 100644 --- a/charts/kagenti-operator/crds/agent.kagenti.dev_agentruntimes.yaml +++ b/charts/kagenti-operator/crds/agent.kagenti.dev_agentruntimes.yaml @@ -118,6 +118,7 @@ spec: type: object type: object mtlsMode: + default: permissive description: |- MTLSMode selects the mTLS posture between authbridge sidecars on the proxy-sidecar / lite paths. envoy-sidecar handles transport @@ -127,8 +128,8 @@ spec: Three valid values: - disabled Plaintext between sidecars (default). - permissive Inbound: byte-peek listener accepts both TLS and + disabled Plaintext between sidecars. + permissive (default) Inbound: byte-peek listener accepts both TLS and plaintext on the same port. Outbound: tries TLS, falls back to plaintext on handshake failure (one-line WARN log per fallback). Use during rollout. @@ -137,7 +138,7 @@ spec: completes. Resolution: AgentRuntime CR > namespace authbridge-runtime-config - mtls.mode > "disabled". Setting mtlsMode != disabled implicitly + mtls.mode > "permissive". Setting mtlsMode != disabled implicitly requires SPIRE — the operator auto-enables spire for the workload. CR-empty vs CR="disabled" are observably different in diff --git a/kagenti-operator/api/v1alpha1/agentruntime_types.go b/kagenti-operator/api/v1alpha1/agentruntime_types.go index 4fde168a..50ca0bba 100644 --- a/kagenti-operator/api/v1alpha1/agentruntime_types.go +++ b/kagenti-operator/api/v1alpha1/agentruntime_types.go @@ -86,8 +86,8 @@ type AgentRuntimeSpec struct { // // Three valid values: // - // disabled Plaintext between sidecars (default). - // permissive Inbound: byte-peek listener accepts both TLS and + // disabled Plaintext between sidecars. + // permissive (default) Inbound: byte-peek listener accepts both TLS and // plaintext on the same port. Outbound: tries TLS, // falls back to plaintext on handshake failure (one-line // WARN log per fallback). Use during rollout. @@ -96,7 +96,7 @@ type AgentRuntimeSpec struct { // completes. // // Resolution: AgentRuntime CR > namespace authbridge-runtime-config - // mtls.mode > "disabled". Setting mtlsMode != disabled implicitly + // mtls.mode > "permissive". Setting mtlsMode != disabled implicitly // requires SPIRE — the operator auto-enables spire for the workload. // // CR-empty vs CR="disabled" are observably different in @@ -111,6 +111,7 @@ type AgentRuntimeSpec struct { // process start). // // +optional + // +kubebuilder:default=permissive // +kubebuilder:validation:Enum=disabled;permissive;strict MTLSMode string `json:"mtlsMode,omitempty"` } diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index cef42202..a074d6da 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -171,10 +171,10 @@ func main() { flag.StringVar(&mlflowCAFile, "mlflow-ca-file", "", "Path to PEM-encoded CA bundle for MLflow TLS verification (appended to system pool)") - flag.BoolVar(&enableCardDiscovery, "enable-card-discovery", false, - "Enable automatic agent card discovery from AgentRuntime workloads into status.card") - flag.BoolVar(&enableVerifiedFetch, "enable-verified-fetch", false, - "Enable mTLS-authenticated fetch of agent cards via SPIFFE identity") + flag.BoolVar(&enableCardDiscovery, "enable-card-discovery", true, + "Enable automatic agent card discovery from AgentRuntime workloads into status.card (set to false to disable)") + flag.BoolVar(&enableVerifiedFetch, "enable-verified-fetch", true, + "Enable mTLS-authenticated fetch of agent cards via SPIFFE identity (set to false as kill switch)") flag.StringVar(&verifiedFetchSpiffeSocket, "verified-fetch-spiffe-socket", "unix:///spiffe-workload-api/spire-agent.sock", "SPIFFE Workload API socket path for verified fetch") @@ -237,6 +237,30 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Startup info logs for defaults that changed in this release. + if enableCardDiscovery { + setupLog.Info("card discovery enabled by default; set --enable-card-discovery=false to disable") + } + if enableVerifiedFetch { + setupLog.Info("verified fetch enabled by default; set --enable-verified-fetch=false to disable") + } + + // Deprecation warnings for legacy flags that are now superseded by + // mTLS defaults (permissive mode auto-enables SPIRE and identity binding). + for _, dep := range []struct { + name string + set bool + }{ + {"require-a2a-signature", requireA2ASignature}, + {"signature-audit-mode", signatureAuditMode}, + {"enforce-network-policies", enforceNetworkPolicies}, + } { + if dep.set { + setupLog.Info("DEPRECATED: flag is superseded by mTLS permissive default; will be removed in a future release", + "flag", dep.name) + } + } + ctx := ctrl.SetupSignalHandler() // ======================================== diff --git a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentruntimes.yaml b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentruntimes.yaml index 05504e25..60f96d49 100644 --- a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentruntimes.yaml +++ b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentruntimes.yaml @@ -118,6 +118,7 @@ spec: type: object type: object mtlsMode: + default: permissive description: |- MTLSMode selects the mTLS posture between authbridge sidecars on the proxy-sidecar / lite paths. envoy-sidecar handles transport @@ -127,8 +128,8 @@ spec: Three valid values: - disabled Plaintext between sidecars (default). - permissive Inbound: byte-peek listener accepts both TLS and + disabled Plaintext between sidecars. + permissive (default) Inbound: byte-peek listener accepts both TLS and plaintext on the same port. Outbound: tries TLS, falls back to plaintext on handshake failure (one-line WARN log per fallback). Use during rollout. @@ -137,7 +138,7 @@ spec: completes. Resolution: AgentRuntime CR > namespace authbridge-runtime-config - mtls.mode > "disabled". Setting mtlsMode != disabled implicitly + mtls.mode > "permissive". Setting mtlsMode != disabled implicitly requires SPIRE — the operator auto-enables spire for the workload. CR-empty vs CR="disabled" are observably different in diff --git a/kagenti-operator/internal/controller/agentruntime_controller.go b/kagenti-operator/internal/controller/agentruntime_controller.go index f435aa64..a49672ed 100644 --- a/kagenti-operator/internal/controller/agentruntime_controller.go +++ b/kagenti-operator/internal/controller/agentruntime_controller.go @@ -63,6 +63,10 @@ const ( // Value is a JSON array of skill names, set by the kagenti backend or the user. AnnotationSkills = "kagenti.io/skills" + // AnnotationMTLSMode is the annotation applied to PodTemplateSpec to advertise the + // resolved mTLS posture. Read by authbridge sidecars for observability. + AnnotationMTLSMode = "kagenti.io/mtls-mode" + // AnnotationRestartPending marks a Sandbox that was scaled to 0 and needs // to be scaled back to 1 on the next reconcile cycle. Two-phase restart // avoids a race with the Sandbox controller's pod-name annotation. @@ -72,6 +76,7 @@ const ( ConditionTypeReady = "Ready" ConditionTypeTargetResolved = "TargetResolved" ConditionTypeConfigResolved = "ConfigResolved" + ConditionTypeMTLSReady = "MTLSReady" // AnnotationLastCardFetchHash stores the change-detection key used to skip // redundant card fetches when the workload's pod template has not changed. @@ -333,6 +338,12 @@ func (r *AgentRuntimeReconciler) applyWorkloadConfig(ctx context.Context, rt *ag key := types.NamespacedName{Name: ref.Name, Namespace: rt.Namespace} + // Resolve mTLS mode: CR value takes precedence, default to "permissive". + mtlsMode := rt.Spec.MTLSMode + if mtlsMode == "" { + mtlsMode = "permissive" + } + var configHashChanged bool err := retry.RetryOnConflict(retry.DefaultRetry, func() error { @@ -348,7 +359,8 @@ func (r *AgentRuntimeReconciler) applyWorkloadConfig(ctx context.Context, rt *ag alreadyConfigured := currentWorkloadLabels[LabelAgentType] == string(rt.Spec.Type) && currentWorkloadLabels[LabelManagedBy] == LabelManagedByValue && currentPodLabels[LabelAgentType] == string(rt.Spec.Type) && - currentPodAnnotations[AnnotationConfigHash] == configHash + currentPodAnnotations[AnnotationConfigHash] == configHash && + currentPodAnnotations[AnnotationMTLSMode] == mtlsMode if alreadyConfigured { return nil @@ -375,19 +387,21 @@ func (r *AgentRuntimeReconciler) applyWorkloadConfig(ctx context.Context, rt *ag podLabels[LabelAgentType] = string(rt.Spec.Type) acc.setPodLabels(acc.obj, podLabels) - // Apply config-hash annotation to PodTemplateSpec + // Apply config-hash and mtls-mode annotations to PodTemplateSpec podAnnotations := acc.getPodAnnotations(acc.obj) if podAnnotations == nil { podAnnotations = make(map[string]string) } podAnnotations[AnnotationConfigHash] = configHash + podAnnotations[AnnotationMTLSMode] = mtlsMode acc.setPodAnnotations(acc.obj, podAnnotations) logger.Info("Applying config to workload", "workload", ref.Name, "kind", ref.Kind, "type", string(rt.Spec.Type), - "configHash", configHash[:12]) + "configHash", configHash[:12], + "mtlsMode", mtlsMode) return r.Update(ctx, acc.obj) }) @@ -725,11 +739,12 @@ func (r *AgentRuntimeReconciler) handleDeletion(ctx context.Context, rt *agentv1 delete(podLabels, LabelAgentType) acc.setPodLabels(acc.obj, podLabels) - // Remove kagenti.io/config-hash from PodTemplateSpec pod annotations. - // This triggers the rolling update that replaces existing injected pods, - // and leaves the workload annotation-clean for any future AR. + // Remove kagenti.io/config-hash and kagenti.io/mtls-mode from PodTemplateSpec + // pod annotations. This triggers the rolling update that replaces existing + // injected pods, and leaves the workload annotation-clean for any future AR. podAnnotations := acc.getPodAnnotations(acc.obj) delete(podAnnotations, AnnotationConfigHash) + delete(podAnnotations, AnnotationMTLSMode) acc.setPodAnnotations(acc.obj, podAnnotations) logger.Info("Removed kagenti labels and config-hash from workload on AgentRuntime deletion", diff --git a/kagenti-operator/internal/webhook/injector/agentruntime_config.go b/kagenti-operator/internal/webhook/injector/agentruntime_config.go index 82280b5d..ee2e7d73 100644 --- a/kagenti-operator/internal/webhook/injector/agentruntime_config.go +++ b/kagenti-operator/internal/webhook/injector/agentruntime_config.go @@ -56,7 +56,7 @@ type AgentRuntimeOverrides struct { // mTLS posture — from .spec.mtlsMode // Nil = no per-workload override; the namespace's - // authbridge-runtime-config mtls.mode (if set) or "disabled" + // authbridge-runtime-config mtls.mode (if set) or "permissive" // applies. MTLSMode *string } diff --git a/kagenti-operator/internal/webhook/injector/envoy_template.go b/kagenti-operator/internal/webhook/injector/envoy_template.go index 74be9aca..15781ab7 100644 --- a/kagenti-operator/internal/webhook/injector/envoy_template.go +++ b/kagenti-operator/internal/webhook/injector/envoy_template.go @@ -72,19 +72,20 @@ func RenderEnvoyConfig(cfg *ResolvedConfig) (string, error) { return cfg.EnvoyYAML, nil } - // MTLSEnabled checks both "" and MTLSModeDisabled because - // ResolvedConfig leaves MTLSMode as "" when no source set it - // (CR / namespace ConfigMap / default — see ResolveConfig). The - // resolution chain only fills MTLSMode when something explicitly - // asked for it, so "" means "no opinion → treat as disabled". + // MTLSEnabled: empty string is treated as permissive (mTLS is on + // by default). Only MTLSModeDisabled explicitly disables mTLS. + effectiveMode := cfg.MTLSMode + if effectiveMode == "" { + effectiveMode = MTLSModePermissive + } data := envoyTemplateData{ AdminPort: cfg.Platform.Proxy.AdminPort, OutboundPort: cfg.Platform.Proxy.Port, InboundPort: cfg.Platform.Proxy.InboundProxyPort, ExtProcPort: defaultExtProcPort, - MTLSEnabled: cfg.MTLSMode != "" && cfg.MTLSMode != MTLSModeDisabled, - MTLSPermissive: cfg.MTLSMode == MTLSModePermissive, - MTLSStrict: cfg.MTLSMode == MTLSModeStrict, + MTLSEnabled: effectiveMode != MTLSModeDisabled, + MTLSPermissive: effectiveMode == MTLSModePermissive, + MTLSStrict: effectiveMode == MTLSModeStrict, } var buf bytes.Buffer diff --git a/kagenti-operator/internal/webhook/injector/envoy_template_test.go b/kagenti-operator/internal/webhook/injector/envoy_template_test.go index c12e14f0..6ed94d6e 100644 --- a/kagenti-operator/internal/webhook/injector/envoy_template_test.go +++ b/kagenti-operator/internal/webhook/injector/envoy_template_test.go @@ -70,10 +70,9 @@ func TestRenderEnvoyConfig_TemplateRendering(t *testing.T) { } func TestRenderEnvoyConfig_MTLSDisabled_NoTLSBlocks(t *testing.T) { - // Default / disabled mode — no TLS blocks should render. Locks in - // the existing plaintext shape so a future template edit can't - // silently leak TLS config into pods that didn't ask for it. - for _, mode := range []string{"", MTLSModeDisabled} { + // Explicitly disabled mode — no TLS blocks should render. + // Empty string is now treated as permissive (mTLS on by default). + for _, mode := range []string{MTLSModeDisabled} { t.Run("mode="+mode, func(t *testing.T) { cfg := &ResolvedConfig{ Platform: config.CompiledDefaults(), diff --git a/kagenti-operator/internal/webhook/injector/pod_mutator.go b/kagenti-operator/internal/webhook/injector/pod_mutator.go index 53ff0bab..3fb1754c 100644 --- a/kagenti-operator/internal/webhook/injector/pod_mutator.go +++ b/kagenti-operator/internal/webhook/injector/pod_mutator.go @@ -257,7 +257,7 @@ func (m *PodMutator) InjectAuthBridge(ctx context.Context, podSpec *corev1.PodSp } } if mtlsMode == "" { - mtlsMode = MTLSModeDisabled + mtlsMode = MTLSModePermissive mtlsSource = "default" } // Defense in depth: the CRD enum check rejects unknown values at @@ -270,10 +270,10 @@ func (m *PodMutator) InjectAuthBridge(ctx context.Context, podSpec *corev1.PodSp case MTLSModeDisabled, MTLSModePermissive, MTLSModeStrict: // recognized, keep as-is default: - mutatorLog.Info("WARN: unrecognized mtlsMode; defaulting to disabled", + mutatorLog.Info("WARN: unrecognized mtlsMode; defaulting to permissive", "namespace", namespace, "crName", crName, "unrecognized", mtlsMode, "source", mtlsSource) - mtlsMode = MTLSModeDisabled + mtlsMode = MTLSModePermissive mtlsSource = "default-invalid-fallback" } mutatorLog.Info("resolved mTLS mode", @@ -516,6 +516,15 @@ func (m *PodMutator) InjectAuthBridge(ctx context.Context, podSpec *corev1.PodSp )) } + // Set MTLS_MODE env var on the authbridge container so it knows the + // resolved mTLS posture at runtime. + for i := range podSpec.Containers { + if podSpec.Containers[i].Name == AuthBridgeProxyContainerName { + setOrAddEnv(&podSpec.Containers[i], "MTLS_MODE", mtlsMode) + break + } + } + // Inject HTTP_PROXY env vars into all existing app containers for i := range podSpec.Containers { c := &podSpec.Containers[i] @@ -620,6 +629,14 @@ func (m *PodMutator) InjectAuthBridge(ctx context.Context, podSpec *corev1.PodSp podSpec.Containers = append(podSpec.Containers, builder.BuildEnvoyProxyContainerWithSpireOption(spireEnabled)) } + // Set MTLS_MODE env var on the envoy-sidecar authbridge container. + for i := range podSpec.Containers { + if podSpec.Containers[i].Name == EnvoyProxyContainerName { + setOrAddEnv(&podSpec.Containers[i], "MTLS_MODE", mtlsMode) + break + } + } + if decision.ProxyInit.Inject && !containerExists(podSpec.InitContainers, ProxyInitContainerName) { outboundExclude := annotations[OutboundPortsExcludeAnnotation] inboundExclude := annotations[InboundPortsExcludeAnnotation] diff --git a/kagenti-operator/internal/webhook/injector/pod_mutator_test.go b/kagenti-operator/internal/webhook/injector/pod_mutator_test.go index 8abd0c4d..b113ce58 100644 --- a/kagenti-operator/internal/webhook/injector/pod_mutator_test.go +++ b/kagenti-operator/internal/webhook/injector/pod_mutator_test.go @@ -233,7 +233,11 @@ func TestInjectAuthBridge_RespectsExistingServiceAccountName(t *testing.T) { func TestInjectAuthBridge_NoSACreationWhenSpiffeHelperDisabled(t *testing.T) { // Spiffe-helper is injected by default for agents. SA creation is skipped // when spiffe-helper is explicitly opted out via its per-sidecar label. - m := newTestMutator(newAgentRuntime("test-ns", "my-agent")) + // MTLSMode must be set to "disabled" because the default (permissive) would + // auto-enable SPIRE, creating a ServiceAccount regardless of the spiffe-helper label. + rt := newAgentRuntime("test-ns", "my-agent") + rt.Spec.MTLSMode = "disabled" + m := newTestMutator(rt) ctx := context.Background() podSpec := &corev1.PodSpec{} diff --git a/kagenti-operator/internal/webhook/injector/resolved_config.go b/kagenti-operator/internal/webhook/injector/resolved_config.go index 75648cb5..50a8b0b1 100644 --- a/kagenti-operator/internal/webhook/injector/resolved_config.go +++ b/kagenti-operator/internal/webhook/injector/resolved_config.go @@ -59,7 +59,7 @@ type ResolvedConfig struct { // raw AuthBridgeRuntimeYAML so callers (e.g. RenderEnvoyConfig) can // branch on the resolved values without re-parsing the YAML. // AuthBridgeMode is "" when no source set it (caller picks the default). - // MTLSMode is "" when no source set it (caller treats as "disabled"). + // MTLSMode is "" when no source set it (caller treats as "permissive"). AuthBridgeMode string MTLSMode string } From ffdc5e567c74b89d005af0e8aeec408dec759a2e Mon Sep 17 00:00:00 2001 From: Varsha Prasad Narsing Date: Fri, 12 Jun 2026 11:57:44 -0700 Subject: [PATCH 2/2] fix(e2e): set mtlsMode: disabled on authbridge test fixtures E2E authbridge tests were failing because the mTLS default changed from disabled to permissive. The envoy-proxy container crash-looped as the envoy template now renders TLS contexts by default, but the E2E test fixtures aren't configured for full mTLS infrastructure. Set mtlsMode: disabled explicitly on both authbridge-agent and authbridge-disabled-agent AgentRuntime fixtures to preserve the existing test behavior. These tests validate sidecar injection, not mTLS transport. Signed-off-by: Varsha Prasad Narsing Assisted-By: Claude (Anthropic AI) --- kagenti-operator/test/e2e/fixtures.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kagenti-operator/test/e2e/fixtures.go b/kagenti-operator/test/e2e/fixtures.go index 2c0f7e52..66fb8ad9 100644 --- a/kagenti-operator/test/e2e/fixtures.go +++ b/kagenti-operator/test/e2e/fixtures.go @@ -959,6 +959,7 @@ metadata: namespace: ` + authBridgeTestNamespace + ` spec: type: agent + mtlsMode: disabled targetRef: apiVersion: apps/v1 kind: Deployment @@ -1092,6 +1093,7 @@ metadata: namespace: ` + authBridgeTestNamespace + ` spec: type: agent + mtlsMode: disabled targetRef: apiVersion: apps/v1 kind: Deployment