From d65f3d5587667cbc67acf7bddf39ec7c3bfd23f7 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Tue, 14 Apr 2026 15:30:39 +0100 Subject: [PATCH 01/16] feat(signature): add shared JWS signer and SPIFFE signer for AgentCard signing Introduce a reusable SignCard function that produces JWS x5c signatures, supporting ECDSA (P-256/P-384/P-521) and RSA key types with RFC 7518 fixed-width R||S encoding. Add SpiffeSigner that wraps a SPIFFE Workload API X509Source to sign AgentCards with the workload's SVID. Signed-off-by: Kevin Cogan Signed-off-by: Kevin Cogan --- kagenti-operator/internal/signature/signer.go | 199 ++++++++++++++++++ .../internal/signature/signer_test.go | 116 ++++++++++ .../internal/signature/spiffe_signer.go | 129 ++++++++++++ .../internal/signature/spiffe_signer_test.go | 161 ++++++++++++++ 4 files changed, 605 insertions(+) create mode 100644 kagenti-operator/internal/signature/signer.go create mode 100644 kagenti-operator/internal/signature/signer_test.go create mode 100644 kagenti-operator/internal/signature/spiffe_signer.go create mode 100644 kagenti-operator/internal/signature/spiffe_signer_test.go diff --git a/kagenti-operator/internal/signature/signer.go b/kagenti-operator/internal/signature/signer.go new file mode 100644 index 00000000..02c240cb --- /dev/null +++ b/kagenti-operator/internal/signature/signer.go @@ -0,0 +1,199 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package signature + +import ( + "context" + "crypto" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/rsa" + "crypto/sha256" + "crypto/x509" + "encoding/base64" + "encoding/json" + "fmt" + + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" +) + +// Signer produces JWS x5c signatures on AgentCard data using a SPIFFE identity. +type Signer interface { + SignCard(ctx context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) + SpiffeID() string + Ready() bool + Close() error +} + +// SignCard signs AgentCard data with the given private key and certificate chain, +// producing a JWS x5c signed JSON output. This is the shared signing logic used +// by both the operator and the init-container. +func SignCard(cardData *agentv1alpha1.AgentCardData, privateKey crypto.Signer, certs []*x509.Certificate) ([]byte, error) { + if cardData == nil { + return nil, fmt.Errorf("card data is nil") + } + if len(certs) == 0 { + return nil, fmt.Errorf("no certificates in SVID chain") + } + leaf := certs[0] + + alg, err := AlgorithmForKey(privateKey.Public()) + if err != nil { + return nil, err + } + + kid := ComputeKID(leaf) + + x5c := make([]string, len(certs)) + for i, cert := range certs { + x5c[i] = base64.StdEncoding.EncodeToString(cert.Raw) + } + + header := &ProtectedHeader{ + Algorithm: alg, + KeyID: kid, + Type: "JOSE", + X5C: x5c, + } + + protectedB64, err := EncodeProtectedHeader(header) + if err != nil { + return nil, fmt.Errorf("failed to encode protected header: %w", err) + } + + payload, err := CreateCanonicalCardJSON(cardData) + if err != nil { + return nil, fmt.Errorf("failed to create canonical JSON: %w", err) + } + + payloadB64 := base64.RawURLEncoding.EncodeToString(payload) + signingInput := []byte(protectedB64 + "." + payloadB64) + + sigBytes, err := SignInput(privateKey, alg, signingInput) + if err != nil { + return nil, fmt.Errorf("signing failed: %w", err) + } + + sigB64 := base64.RawURLEncoding.EncodeToString(sigBytes) + + cardData.Signatures = []agentv1alpha1.AgentCardSignature{ + { + Protected: protectedB64, + Signature: sigB64, + }, + } + + output, err := json.MarshalIndent(cardData, "", " ") + if err != nil { + return nil, fmt.Errorf("failed to marshal signed card: %w", err) + } + + return output, nil +} + +// AlgorithmForKey maps a public key type to its JWS algorithm. +func AlgorithmForKey(pub crypto.PublicKey) (string, error) { + switch k := pub.(type) { + case *rsa.PublicKey: + if k.N.BitLen() < 2048 { + return "", fmt.Errorf("RSA key too small: %d bits (minimum 2048)", k.N.BitLen()) + } + return "RS256", nil + case *ecdsa.PublicKey: + switch k.Curve { + case elliptic.P256(): + return "ES256", nil + case elliptic.P384(): + return "ES384", nil + case elliptic.P521(): + return "ES512", nil + default: + return "", fmt.Errorf("unsupported ECDSA curve: %s", k.Curve.Params().Name) + } + default: + return "", fmt.Errorf("unsupported key type: %T", pub) + } +} + +// ComputeKID derives a key ID from the leaf cert's SHA-256 fingerprint (first 8 bytes). +func ComputeKID(leaf *x509.Certificate) string { + fp := sha256.Sum256(leaf.Raw) + return fmt.Sprintf("%x", fp[:8]) +} + +// SignInput signs the JWS signing input with the appropriate algorithm. +func SignInput(signer crypto.Signer, alg string, input []byte) ([]byte, error) { + hashFunc, err := HashForAlgorithm(alg) + if err != nil { + return nil, err + } + + h := hashFunc.New() + h.Write(input) + hashed := h.Sum(nil) + + switch alg { //nolint:goconst // JWS algorithm identifiers are well-known strings + case "RS256", "RS384", "RS512": + return signer.Sign(rand.Reader, hashed, hashFunc) + case "ES256", "ES384", "ES512": + return SignECDSARaw(signer, hashed, alg) + default: + return nil, fmt.Errorf("unsupported algorithm: %s", alg) + } +} + +// SignECDSARaw signs with ECDSA and encodes as fixed-width R||S (RFC 7518 §3.4). +func SignECDSARaw(signer crypto.Signer, hashed []byte, alg string) ([]byte, error) { + ecKey, ok := signer.(*ecdsa.PrivateKey) + if !ok { + return nil, fmt.Errorf("expected *ecdsa.PrivateKey, got %T", signer) + } + + r, s, err := ecdsa.Sign(rand.Reader, ecKey, hashed) + if err != nil { + return nil, fmt.Errorf("ECDSA sign failed: %w", err) + } + + keySize := CurveByteSize(ecKey.Curve) + sig := make([]byte, 2*keySize) + rBytes := r.Bytes() + sBytes := s.Bytes() + copy(sig[keySize-len(rBytes):keySize], rBytes) + copy(sig[2*keySize-len(sBytes):], sBytes) + + return sig, nil +} + +// ZeroPrivateKey zeroes private key material in memory (best-effort). +func ZeroPrivateKey(key crypto.Signer) { + switch k := key.(type) { + case *ecdsa.PrivateKey: + if k.D != nil { + k.D.SetInt64(0) + } + case *rsa.PrivateKey: + if k.D != nil { + k.D.SetInt64(0) + } + for _, p := range k.Primes { + if p != nil { + p.SetInt64(0) + } + } + } +} diff --git a/kagenti-operator/internal/signature/signer_test.go b/kagenti-operator/internal/signature/signer_test.go new file mode 100644 index 00000000..7288a0bc --- /dev/null +++ b/kagenti-operator/internal/signature/signer_test.go @@ -0,0 +1,116 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package signature + +import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "encoding/json" + "testing" + + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" +) + +func TestSignCard_ECDSA_P256(t *testing.T) { + ca := newTestCA(t) + key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + leaf := ca.issueLeaf(t, key, leafOpts{ + spiffeIDs: []string{"spiffe://example.org/ns/default/sa/test"}, + }) + + card := testCard() + output, err := SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + if err != nil { + t.Fatalf("SignCard failed: %v", err) + } + + var parsed agentv1alpha1.AgentCardData + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("output is not valid JSON: %v", err) + } + if len(parsed.Signatures) != 1 { + t.Fatalf("expected 1 signature, got %d", len(parsed.Signatures)) + } + + header, err := DecodeProtectedHeader(parsed.Signatures[0].Protected) + if err != nil { + t.Fatalf("failed to decode protected header: %v", err) + } + if header.Algorithm != "ES256" { + t.Errorf("expected alg=ES256, got %s", header.Algorithm) + } + if header.Type != "JOSE" { + t.Errorf("expected typ=JOSE, got %s", header.Type) + } + if len(header.X5C) != 2 { + t.Errorf("expected 2 certs in x5c, got %d", len(header.X5C)) + } +} + +func TestSignCard_NilCardData(t *testing.T) { + key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + _, err := SignCard(nil, key, []*x509.Certificate{{}}) + if err == nil { + t.Error("expected error for nil card data") + } +} + +func TestSignCard_NoCertificates(t *testing.T) { + key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + _, err := SignCard(testCard(), key, nil) + if err == nil { + t.Error("expected error for empty cert chain") + } +} + +// CRITICAL: Round-trip test proving operator-signed cards verify with X5CProvider. +func TestSignCard_RoundTrip_X5CProvider(t *testing.T) { + ca := newTestCA(t) + key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + leaf := ca.issueLeaf(t, key, leafOpts{ + spiffeIDs: []string{"spiffe://example.org/ns/default/sa/operator"}, + }) + + card := testCard() + output, err := SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + if err != nil { + t.Fatalf("SignCard failed: %v", err) + } + + var parsed agentv1alpha1.AgentCardData + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("output is not valid JSON: %v", err) + } + + provider := newTestX5CProvider(t, ca) + cardWithoutSigs := parsed + cardWithoutSigs.Signatures = nil + + result, err := provider.VerifySignature(context.Background(), &cardWithoutSigs, parsed.Signatures) + if err != nil { + t.Fatalf("X5CProvider.VerifySignature error: %v", err) + } + if !result.Verified { + t.Errorf("round-trip failed: X5CProvider rejected SignCard output: %s", result.Details) + } + if result.SpiffeID != "spiffe://example.org/ns/default/sa/operator" { + t.Errorf("expected SPIFFE ID from cert, got %q", result.SpiffeID) + } +} diff --git a/kagenti-operator/internal/signature/spiffe_signer.go b/kagenti-operator/internal/signature/spiffe_signer.go new file mode 100644 index 00000000..cdf6c29c --- /dev/null +++ b/kagenti-operator/internal/signature/spiffe_signer.go @@ -0,0 +1,129 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package signature + +import ( + "context" + "fmt" + "sync" + + "github.com/spiffe/go-spiffe/v2/svid/x509svid" + "github.com/spiffe/go-spiffe/v2/workloadapi" + + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" +) + +// X509SVIDSource abstracts the SPIFFE X509Source for testability. +type X509SVIDSource interface { + GetX509SVID() (*x509svid.SVID, error) + Close() error +} + +// SpiffeSigner implements the Signer interface using a SPIFFE X509Source +// that auto-rotates SVIDs as they approach expiry. +type SpiffeSigner struct { + source X509SVIDSource + mu sync.RWMutex + closed bool +} + +// NewSpiffeSigner creates a SpiffeSigner backed by the SPIFFE Workload API. +// The X509Source automatically rotates SVIDs before expiry. +func NewSpiffeSigner(ctx context.Context, socketPath string) (*SpiffeSigner, error) { + source, err := workloadapi.NewX509Source(ctx, workloadapi.WithClientOptions(workloadapi.WithAddr(socketPath))) + if err != nil { + return nil, fmt.Errorf("failed to create X509Source: %w", err) + } + + return &SpiffeSigner{ + source: source, + }, nil +} + +// NewSpiffeSignerWithSource creates a SpiffeSigner with a custom X509SVIDSource (for testing). +func NewSpiffeSignerWithSource(source X509SVIDSource) *SpiffeSigner { + return &SpiffeSigner{ + source: source, + } +} + +// SignCard signs the AgentCard data using the current SVID from the X509Source. +func (s *SpiffeSigner) SignCard(ctx context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + if s.closed { + return nil, fmt.Errorf("signer is closed") + } + + svid, err := s.source.GetX509SVID() + if err != nil { + return nil, fmt.Errorf("failed to get X509-SVID: %w", err) + } + + signed, err := SignCard(cardData, svid.PrivateKey, svid.Certificates) + if err != nil { + return nil, fmt.Errorf("signing failed: %w", err) + } + + // Do NOT zero svid.PrivateKey here: the X509Source owns the SVID and + // returns a shared reference. Zeroing it would corrupt future calls. + return signed, nil +} + +// SpiffeID returns the SPIFFE ID from the current SVID, or empty string if unavailable. +func (s *SpiffeSigner) SpiffeID() string { + s.mu.RLock() + defer s.mu.RUnlock() + + if s.closed { + return "" + } + + svid, err := s.source.GetX509SVID() + if err != nil { + return "" + } + + return svid.ID.String() +} + +// Ready returns true when the signer has obtained at least one SVID. +func (s *SpiffeSigner) Ready() bool { + s.mu.RLock() + defer s.mu.RUnlock() + + if s.closed { + return false + } + + _, err := s.source.GetX509SVID() + return err == nil +} + +// Close shuts down the X509Source and releases resources. +func (s *SpiffeSigner) Close() error { + s.mu.Lock() + defer s.mu.Unlock() + + if s.closed { + return nil + } + + s.closed = true + return s.source.Close() +} diff --git a/kagenti-operator/internal/signature/spiffe_signer_test.go b/kagenti-operator/internal/signature/spiffe_signer_test.go new file mode 100644 index 00000000..31c8aba4 --- /dev/null +++ b/kagenti-operator/internal/signature/spiffe_signer_test.go @@ -0,0 +1,161 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package signature + +import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "fmt" + "testing" + + "github.com/spiffe/go-spiffe/v2/spiffeid" + "github.com/spiffe/go-spiffe/v2/svid/x509svid" +) + +type mockX509Source struct { + svid *x509svid.SVID + err error + closed bool + callCnt int +} + +func (m *mockX509Source) GetX509SVID() (*x509svid.SVID, error) { + m.callCnt++ + if m.err != nil { + return nil, m.err + } + return m.svid, nil +} + +func (m *mockX509Source) Close() error { + m.closed = true + return nil +} + +func newMockSVID(t *testing.T, ca *testCA, spiffeIDStr string) *x509svid.SVID { + t.Helper() + key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + leaf := ca.issueLeaf(t, key, leafOpts{ + spiffeIDs: []string{spiffeIDStr}, + }) + td, _ := spiffeid.TrustDomainFromString("example.org") + id, _ := spiffeid.FromPath(td, "/ns/default/sa/test") + return &x509svid.SVID{ + ID: id, + Certificates: []*x509.Certificate{leaf, ca.Cert}, + PrivateKey: key, + } +} + +func TestSpiffeSigner_SignCard_Success(t *testing.T) { + ca := newTestCA(t) + svid := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") + source := &mockX509Source{svid: svid} + + signer := NewSpiffeSignerWithSource(source) + defer signer.Close() //nolint:errcheck // test cleanup + + card := testCard() + output, err := signer.SignCard(context.Background(), card) + if err != nil { + t.Fatalf("SignCard failed: %v", err) + } + if len(output) == 0 { + t.Error("expected non-empty signed output") + } +} + +func TestSpiffeSigner_NotReady(t *testing.T) { + source := &mockX509Source{err: fmt.Errorf("no SVID available")} + signer := NewSpiffeSignerWithSource(source) + defer signer.Close() //nolint:errcheck // test cleanup + + if signer.Ready() { + t.Error("expected Ready()=false when source returns error") + } + + _, err := signer.SignCard(context.Background(), testCard()) + if err == nil { + t.Error("expected error when signer not ready") + } +} + +func TestSpiffeSigner_SpiffeID(t *testing.T) { + ca := newTestCA(t) + svid := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") + source := &mockX509Source{svid: svid} + + signer := NewSpiffeSignerWithSource(source) + defer signer.Close() //nolint:errcheck // test cleanup + + id := signer.SpiffeID() + if id != "spiffe://example.org/ns/default/sa/test" { + t.Errorf("expected spiffe://example.org/ns/default/sa/test, got %q", id) + } +} + +func TestSpiffeSigner_SVIDRotation(t *testing.T) { + ca := newTestCA(t) + svid1 := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") + source := &mockX509Source{svid: svid1} + + signer := NewSpiffeSignerWithSource(source) + defer signer.Close() //nolint:errcheck // test cleanup + + card := testCard() + output1, err := signer.SignCard(context.Background(), card) + if err != nil { + t.Fatalf("first SignCard failed: %v", err) + } + + svid2 := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") + source.svid = svid2 + + card2 := testCard() + output2, err := signer.SignCard(context.Background(), card2) + if err != nil { + t.Fatalf("second SignCard failed: %v", err) + } + + if string(output1) == string(output2) { + t.Error("expected different signatures after SVID rotation (different key)") + } +} + +func TestSpiffeSigner_Close(t *testing.T) { + source := &mockX509Source{err: fmt.Errorf("not ready")} + signer := NewSpiffeSignerWithSource(source) + + if err := signer.Close(); err != nil { + t.Fatalf("Close failed: %v", err) + } + if !source.closed { + t.Error("expected source to be closed") + } + + if signer.Ready() { + t.Error("expected Ready()=false after close") + } + + _, err := signer.SignCard(context.Background(), testCard()) + if err == nil { + t.Error("expected error after close") + } +} From 911c5469c4e9db05f21e1e6ef9a6aafac57ad328 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Tue, 14 Apr 2026 15:31:08 +0100 Subject: [PATCH 02/16] feat(agentcard): add ConfigMap writer and refactor init-container signer Add Writer that creates or updates signed AgentCard ConfigMaps using an uncached API reader to avoid cache-scoping issues across namespaces. Refactor the init-container signer to use the shared signature.SignCard function and the canonical agentcard.ConfigMapName constant, eliminating duplicate logic. Signed-off-by: Kevin Cogan Signed-off-by: Kevin Cogan --- kagenti-operator/cmd/agentcard-signer/main.go | 172 +----------------- .../cmd/agentcard-signer/main_test.go | 108 +++++------ kagenti-operator/internal/agentcard/writer.go | 99 ++++++++++ .../internal/agentcard/writer_test.go | 140 ++++++++++++++ 4 files changed, 294 insertions(+), 225 deletions(-) create mode 100644 kagenti-operator/internal/agentcard/writer.go create mode 100644 kagenti-operator/internal/agentcard/writer_test.go diff --git a/kagenti-operator/cmd/agentcard-signer/main.go b/kagenti-operator/cmd/agentcard-signer/main.go index 31fea356..f2522af6 100644 --- a/kagenti-operator/cmd/agentcard-signer/main.go +++ b/kagenti-operator/cmd/agentcard-signer/main.go @@ -18,14 +18,6 @@ package main import ( "context" - "crypto" - "crypto/ecdsa" - "crypto/elliptic" - "crypto/rand" - "crypto/rsa" - "crypto/sha256" - "crypto/x509" - "encoding/base64" "encoding/json" "fmt" "os" @@ -42,6 +34,7 @@ import ( "github.com/spiffe/go-spiffe/v2/workloadapi" agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" + "github.com/kagenti/operator/internal/agentcard" "github.com/kagenti/operator/internal/signature" ) @@ -84,7 +77,7 @@ func run() error { if err != nil { return fmt.Errorf("failed to fetch X.509-SVID: %w", err) } - defer zeroPrivateKey(svid.PrivateKey) + defer signature.ZeroPrivateKey(svid.PrivateKey) spiffeID := svid.ID.String() logJSON("info", "fetched SVID", "spiffe_id", spiffeID) @@ -99,7 +92,7 @@ func run() error { return fmt.Errorf("failed to parse unsigned card JSON: %w", err) } - signedCard, err := signCard(&cardData, svid.PrivateKey, svid.Certificates) + signedCard, err := signature.SignCard(&cardData, svid.PrivateKey, svid.Certificates) if err != nil { return fmt.Errorf("signing failed: %w", err) } @@ -148,7 +141,7 @@ func writeConfigMapWithClient( ctx context.Context, clientset k8sclient.Interface, agentName, namespace string, signedCard []byte, ) error { - cmName := agentName + "-card-signed" + cmName := agentcard.ConfigMapName(agentName) cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{Name: cmName, Namespace: namespace}, Data: map[string]string{"agent-card.json": string(signedCard)}, @@ -171,7 +164,7 @@ func fetchSVID(ctx context.Context, socketPath string) (*x509svid.SVID, error) { if err != nil { return nil, fmt.Errorf("failed to create workload API client: %w", err) } - defer client.Close() + defer client.Close() //nolint:errcheck // best-effort cleanup svid, err := client.FetchX509SVID(ctx) if err != nil { @@ -180,161 +173,6 @@ func fetchSVID(ctx context.Context, socketPath string) (*x509svid.SVID, error) { return svid, nil } -// signCard signs AgentCard data and returns the signed JSON. -func signCard(cardData *agentv1alpha1.AgentCardData, privateKey crypto.Signer, certs []*x509.Certificate) ([]byte, error) { - if cardData == nil { - return nil, fmt.Errorf("card data is nil") - } - if len(certs) == 0 { - return nil, fmt.Errorf("no certificates in SVID chain") - } - leaf := certs[0] - - alg, err := algorithmForKey(privateKey.Public()) - if err != nil { - return nil, err - } - - kid := computeKID(leaf) - - x5c := make([]string, len(certs)) - for i, cert := range certs { - x5c[i] = base64.StdEncoding.EncodeToString(cert.Raw) - } - - header := &signature.ProtectedHeader{ - Algorithm: alg, - KeyID: kid, - Type: "JOSE", - X5C: x5c, - } - - protectedB64, err := signature.EncodeProtectedHeader(header) - if err != nil { - return nil, fmt.Errorf("failed to encode protected header: %w", err) - } - - payload, err := signature.CreateCanonicalCardJSON(cardData) - if err != nil { - return nil, fmt.Errorf("failed to create canonical JSON: %w", err) - } - - payloadB64 := base64.RawURLEncoding.EncodeToString(payload) - signingInput := []byte(protectedB64 + "." + payloadB64) - - sigBytes, err := signInput(privateKey, alg, signingInput) - if err != nil { - return nil, fmt.Errorf("signing failed: %w", err) - } - - sigB64 := base64.RawURLEncoding.EncodeToString(sigBytes) - - cardData.Signatures = []agentv1alpha1.AgentCardSignature{ - { - Protected: protectedB64, - Signature: sigB64, - }, - } - - output, err := json.MarshalIndent(cardData, "", " ") - if err != nil { - return nil, fmt.Errorf("failed to marshal signed card: %w", err) - } - - return output, nil -} - -// algorithmForKey maps a public key type to its JWS algorithm. -func algorithmForKey(pub crypto.PublicKey) (string, error) { - switch k := pub.(type) { - case *rsa.PublicKey: - if k.N.BitLen() < 2048 { - return "", fmt.Errorf("RSA key too small: %d bits (minimum 2048)", k.N.BitLen()) - } - return "RS256", nil - case *ecdsa.PublicKey: - switch k.Curve { - case elliptic.P256(): - return "ES256", nil - case elliptic.P384(): - return "ES384", nil - case elliptic.P521(): - return "ES512", nil - default: - return "", fmt.Errorf("unsupported ECDSA curve: %s", k.Curve.Params().Name) - } - default: - return "", fmt.Errorf("unsupported key type: %T", pub) - } -} - -// computeKID derives a key ID from the leaf cert's SHA-256 fingerprint (first 8 bytes). -func computeKID(leaf *x509.Certificate) string { - fp := sha256.Sum256(leaf.Raw) - return fmt.Sprintf("%x", fp[:8]) -} - -func signInput(signer crypto.Signer, alg string, input []byte) ([]byte, error) { - hashFunc, err := signature.HashForAlgorithm(alg) - if err != nil { - return nil, err - } - - h := hashFunc.New() - h.Write(input) - hashed := h.Sum(nil) - - switch alg { - case "RS256", "RS384", "RS512": - return signer.Sign(rand.Reader, hashed, hashFunc) - case "ES256", "ES384", "ES512": - return signECDSARaw(signer, hashed, alg) - default: - return nil, fmt.Errorf("unsupported algorithm: %s", alg) - } -} - -// signECDSARaw signs with ECDSA and encodes as fixed-width R||S (RFC 7518 §3.4). -func signECDSARaw(signer crypto.Signer, hashed []byte, alg string) ([]byte, error) { - ecKey, ok := signer.(*ecdsa.PrivateKey) - if !ok { - return nil, fmt.Errorf("expected *ecdsa.PrivateKey, got %T", signer) - } - - r, s, err := ecdsa.Sign(rand.Reader, ecKey, hashed) - if err != nil { - return nil, fmt.Errorf("ECDSA sign failed: %w", err) - } - - keySize := signature.CurveByteSize(ecKey.Curve) - sig := make([]byte, 2*keySize) - rBytes := r.Bytes() - sBytes := s.Bytes() - copy(sig[keySize-len(rBytes):keySize], rBytes) - copy(sig[2*keySize-len(sBytes):], sBytes) - - return sig, nil -} - -// zeroPrivateKey zeroes private key material in memory (best-effort). -func zeroPrivateKey(key crypto.Signer) { - switch k := key.(type) { - case *ecdsa.PrivateKey: - if k.D != nil { - k.D.SetInt64(0) - } - case *rsa.PrivateKey: - if k.D != nil { - k.D.SetInt64(0) - } - for _, p := range k.Primes { - if p != nil { - p.SetInt64(0) - } - } - } -} - func envOrDefault(key, defaultVal string) string { if v := os.Getenv(key); v != "" { return v diff --git a/kagenti-operator/cmd/agentcard-signer/main_test.go b/kagenti-operator/cmd/agentcard-signer/main_test.go index 551efcaf..e2ff3354 100644 --- a/kagenti-operator/cmd/agentcard-signer/main_test.go +++ b/kagenti-operator/cmd/agentcard-signer/main_test.go @@ -106,7 +106,7 @@ func testCard() *agentv1alpha1.AgentCardData { } } -// --- signCard tests --- +// --- SignCard tests (via shared library) --- func TestSignCard_ECDSA_P256(t *testing.T) { ca := newTestCA(t) @@ -114,9 +114,9 @@ func TestSignCard_ECDSA_P256(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/ns/default/sa/test") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData @@ -148,9 +148,9 @@ func TestSignCard_ECDSA_P384(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData @@ -170,9 +170,9 @@ func TestSignCard_RSA(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/rsa-agent") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData @@ -188,7 +188,7 @@ func TestSignCard_RSA(t *testing.T) { func TestSignCard_NilCardData(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - _, err := signCard(nil, key, []*x509.Certificate{{}}) + _, err := signature.SignCard(nil, key, []*x509.Certificate{{}}) if err == nil { t.Error("expected error for nil card data") } @@ -196,7 +196,7 @@ func TestSignCard_NilCardData(t *testing.T) { func TestSignCard_NoCertificates(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - _, err := signCard(testCard(), key, nil) + _, err := signature.SignCard(testCard(), key, nil) if err == nil { t.Error("expected error for empty cert chain") } @@ -210,20 +210,21 @@ func TestSignCard_ECDSA_RawRS_ByteLength(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } sigBytes, err := base64.RawURLEncoding.DecodeString(parsed.Signatures[0].Signature) if err != nil { t.Fatalf("failed to decode signature: %v", err) } - // ES256 raw R||S must be exactly 64 bytes (32 + 32) if len(sigBytes) != 64 { t.Errorf("ES256 raw R||S signature must be 64 bytes, got %d (likely DER-encoded)", len(sigBytes)) } @@ -235,16 +236,17 @@ func TestSignCard_ECDSA_P384_RawRS_ByteLength(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } sigBytes, _ := base64.RawURLEncoding.DecodeString(parsed.Signatures[0].Signature) - // ES384 raw R||S must be exactly 96 bytes (48 + 48) if len(sigBytes) != 96 { t.Errorf("ES384 raw R||S signature must be 96 bytes, got %d", len(sigBytes)) } @@ -258,14 +260,15 @@ func TestSignCard_X5C_StandardBase64(t *testing.T) { leaf, leafDER := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, _ := signCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + output, _ := signature.SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } header, _ := signature.DecodeProtectedHeader(parsed.Signatures[0].Protected) - // x5c must use standard base64 (not base64url) per RFC 7515 §4.1.6 decoded, err := base64.StdEncoding.DecodeString(header.X5C[0]) if err != nil { t.Fatalf("x5c[0] is not valid standard base64: %v", err) @@ -281,10 +284,12 @@ func TestSignCard_X5C_LeafFirst(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, _ := signCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + output, _ := signature.SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } header, _ := signature.DecodeProtectedHeader(parsed.Signatures[0].Protected) @@ -308,22 +313,20 @@ func TestComputeKID(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") - kid := computeKID(leaf) + kid := signature.ComputeKID(leaf) fp := sha256.Sum256(leaf.Raw) - expected := big.NewInt(0).SetBytes(fp[:8]).Text(16) - // kid should be first 16 hex chars of SHA-256 fingerprint + _ = big.NewInt(0).SetBytes(fp[:8]).Text(16) if len(kid) != 16 { t.Errorf("expected kid length 16, got %d: %s", len(kid), kid) } - _ = expected // format may differ in leading zeros, just check length } -// --- algorithmForKey tests --- +// --- AlgorithmForKey tests --- func TestAlgorithmForKey_ECDSA_P256(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - alg, err := algorithmForKey(&key.PublicKey) + alg, err := signature.AlgorithmForKey(&key.PublicKey) if err != nil { t.Fatal(err) } @@ -334,7 +337,7 @@ func TestAlgorithmForKey_ECDSA_P256(t *testing.T) { func TestAlgorithmForKey_ECDSA_P384(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P384(), rand.Reader) - alg, err := algorithmForKey(&key.PublicKey) + alg, err := signature.AlgorithmForKey(&key.PublicKey) if err != nil { t.Fatal(err) } @@ -345,7 +348,7 @@ func TestAlgorithmForKey_ECDSA_P384(t *testing.T) { func TestAlgorithmForKey_ECDSA_P521(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P521(), rand.Reader) - alg, err := algorithmForKey(&key.PublicKey) + alg, err := signature.AlgorithmForKey(&key.PublicKey) if err != nil { t.Fatal(err) } @@ -356,7 +359,7 @@ func TestAlgorithmForKey_ECDSA_P521(t *testing.T) { func TestAlgorithmForKey_RSA(t *testing.T) { key, _ := rsa.GenerateKey(rand.Reader, 2048) - alg, err := algorithmForKey(&key.PublicKey) + alg, err := signature.AlgorithmForKey(&key.PublicKey) if err != nil { t.Fatal(err) } @@ -367,17 +370,17 @@ func TestAlgorithmForKey_RSA(t *testing.T) { func TestAlgorithmForKey_RSA_TooSmall(t *testing.T) { key, _ := rsa.GenerateKey(rand.Reader, 1024) - _, err := algorithmForKey(&key.PublicKey) + _, err := signature.AlgorithmForKey(&key.PublicKey) if err == nil { t.Error("expected error for 1024-bit RSA key") } } -// --- zeroPrivateKey tests --- +// --- ZeroPrivateKey tests --- func TestZeroPrivateKey_ECDSA(t *testing.T) { key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - zeroPrivateKey(key) + signature.ZeroPrivateKey(key) if key.D.Sign() != 0 { t.Error("expected ECDSA D to be zeroed") } @@ -385,7 +388,7 @@ func TestZeroPrivateKey_ECDSA(t *testing.T) { func TestZeroPrivateKey_RSA(t *testing.T) { key, _ := rsa.GenerateKey(rand.Reader, 2048) - zeroPrivateKey(key) + signature.ZeroPrivateKey(key) if key.D.Sign() != 0 { t.Error("expected RSA D to be zeroed") } @@ -397,8 +400,6 @@ func TestZeroPrivateKey_RSA(t *testing.T) { } // --- Canonical JSON cross-validation --- -// Signer uses signature.CreateCanonicalCardJSON -- verify the output matches -// what the verifier expects. func TestSignCard_CanonicalJSON_CrossValidation(t *testing.T) { ca := newTestCA(t) @@ -406,27 +407,20 @@ func TestSignCard_CanonicalJSON_CrossValidation(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/agent") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } - // Re-derive the canonical JSON from the parsed card (without signatures) cardWithoutSigs := parsed cardWithoutSigs.Signatures = nil - canonical, err := signature.CreateCanonicalCardJSON(&cardWithoutSigs) - if err != nil { - t.Fatalf("CreateCanonicalCardJSON failed: %v", err) - } - // Reconstruct the signing input and verify the signature sig := parsed.Signatures[0] - payloadB64 := base64.RawURLEncoding.EncodeToString(canonical) - signingInput := sig.Protected + "." + payloadB64 - pubPEM, _ := signature.MarshalPublicKeyToPEM(&key.PublicKey) result, err := signature.VerifyJWS(&cardWithoutSigs, &sig, pubPEM) if err != nil { @@ -435,7 +429,6 @@ func TestSignCard_CanonicalJSON_CrossValidation(t *testing.T) { if !result.Verified { t.Errorf("cross-validation failed: signer output not verified by VerifyJWS: %s", result.Details) } - _ = signingInput } // --- End-to-end: signer output verified by X5CProvider --- @@ -446,15 +439,16 @@ func TestSignCard_VerifiedByX5CProvider(t *testing.T) { leaf, _ := ca.issueLeaf(t, &key.PublicKey, "spiffe://example.org/ns/default/sa/test") card := testCard() - output, err := signCard(card, key, []*x509.Certificate{leaf, ca.Cert}) + output, err := signature.SignCard(card, key, []*x509.Certificate{leaf, ca.Cert}) if err != nil { - t.Fatalf("signCard failed: %v", err) + t.Fatalf("SignCard failed: %v", err) } var parsed agentv1alpha1.AgentCardData - json.Unmarshal(output, &parsed) + if err := json.Unmarshal(output, &parsed); err != nil { + t.Fatalf("failed to unmarshal signed card: %v", err) + } - // Build an X5CProvider with the test CA pool := x509.NewCertPool() pool.AddCert(ca.Cert) provider := &signature.X5CProvider{} @@ -477,7 +471,7 @@ func TestSignCard_VerifiedByX5CProvider(t *testing.T) { // --- writeConfigMap tests --- func TestWriteConfigMapWithClient_Create(t *testing.T) { - fakeClient := k8sfake.NewSimpleClientset() + fakeClient := k8sfake.NewSimpleClientset() //nolint:staticcheck // NewClientset requires generated apply configs cardJSON := []byte(`{"name":"test-agent","version":"1.0"}`) err := writeConfigMapWithClient(context.Background(), fakeClient, "my-agent", "test-ns", cardJSON) @@ -500,6 +494,7 @@ func TestWriteConfigMapWithClient_Update(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "my-agent-card-signed", Namespace: "test-ns"}, Data: map[string]string{"agent-card.json": `{"name":"old"}`}, } + //nolint:staticcheck // NewClientset requires generated apply configs fakeClient := k8sfake.NewSimpleClientset(existing) newCardJSON := []byte(`{"name":"updated-agent","version":"2.0"}`) @@ -523,7 +518,4 @@ func TestWriteConfigMap_MissingEnvVars(t *testing.T) { if err == nil { t.Fatal("expected error when env vars are missing") } - if testing.Verbose() { - t.Logf("writeConfigMap error: %v", err) - } } diff --git a/kagenti-operator/internal/agentcard/writer.go b/kagenti-operator/internal/agentcard/writer.go new file mode 100644 index 00000000..3e93c4cf --- /dev/null +++ b/kagenti-operator/internal/agentcard/writer.go @@ -0,0 +1,99 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agentcard + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" +) + +var writerLogger = ctrl.Log.WithName("agentcard").WithName("writer") + +// Writer creates or updates signed AgentCard ConfigMaps. +type Writer struct { + client client.Client + reader client.Reader + scheme *runtime.Scheme +} + +// NewWriter creates a Writer for signed AgentCard ConfigMaps. +// The reader should be an uncached API reader (mgr.GetAPIReader()) to avoid +// cache-scoping issues when the ConfigMap is in a different namespace. +func NewWriter(c client.Client, reader client.Reader, scheme *runtime.Scheme) *Writer { + return &Writer{client: c, reader: reader, scheme: scheme} +} + +// ConfigMapName returns the canonical ConfigMap name for a signed agent card. +func ConfigMapName(agentName string) string { + return agentName + SignedCardConfigMapSuffix +} + +// WriteSignedCard creates or updates the signed card ConfigMap, setting an +// owner reference to the AgentCard CR for automatic garbage collection. +func (w *Writer) WriteSignedCard(ctx context.Context, owner *agentv1alpha1.AgentCard, agentName, namespace string, signedCard []byte) error { + cmName := ConfigMapName(agentName) + + cm := &corev1.ConfigMap{} + key := types.NamespacedName{Name: cmName, Namespace: namespace} + err := w.reader.Get(ctx, key, cm) + + if apierrors.IsNotFound(err) { + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: namespace, + }, + Data: map[string]string{ + SignedCardConfigMapKey: string(signedCard), + }, + } + if err := controllerutil.SetControllerReference(owner, cm, w.scheme); err != nil { + return fmt.Errorf("failed to set owner reference on ConfigMap %s: %w", cmName, err) + } + if err := w.client.Create(ctx, cm); err != nil { + return fmt.Errorf("failed to create ConfigMap %s: %w", cmName, err) + } + writerLogger.Info("Created signed card ConfigMap", "configMap", cmName, "namespace", namespace) + return nil + } else if err != nil { + return fmt.Errorf("failed to get ConfigMap %s: %w", cmName, err) + } + + cm.Data = map[string]string{ + SignedCardConfigMapKey: string(signedCard), + } + if err := controllerutil.SetControllerReference(owner, cm, w.scheme); err != nil { + writerLogger.V(1).Info("Could not set owner reference on existing ConfigMap (may already be owned)", + "configMap", cmName, "error", err.Error()) + } + if err := w.client.Update(ctx, cm); err != nil { + return fmt.Errorf("failed to update ConfigMap %s: %w", cmName, err) + } + writerLogger.Info("Updated signed card ConfigMap", "configMap", cmName, "namespace", namespace) + return nil +} diff --git a/kagenti-operator/internal/agentcard/writer_test.go b/kagenti-operator/internal/agentcard/writer_test.go new file mode 100644 index 00000000..f787b514 --- /dev/null +++ b/kagenti-operator/internal/agentcard/writer_test.go @@ -0,0 +1,140 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agentcard + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" +) + +func testScheme() *runtime.Scheme { + s := runtime.NewScheme() + utilruntime.Must(corev1.AddToScheme(s)) + utilruntime.Must(agentv1alpha1.AddToScheme(s)) + return s +} + +func testAgentCard() *agentv1alpha1.AgentCard { + return &agentv1alpha1.AgentCard{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-card", + Namespace: "test-ns", + UID: "test-uid-123", + }, + } +} + +func TestWriter_CreateConfigMap(t *testing.T) { + scheme := testScheme() + client := fake.NewClientBuilder().WithScheme(scheme).Build() + writer := NewWriter(client, client, scheme) + + owner := testAgentCard() + signedCard := []byte(`{"name":"test-agent","signatures":[{"protected":"xxx","signature":"yyy"}]}`) + + err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", signedCard) + if err != nil { + t.Fatalf("WriteSignedCard failed: %v", err) + } + + cm := &corev1.ConfigMap{} + err = client.Get(context.Background(), types.NamespacedName{ + Name: "my-agent-card-signed", + Namespace: "test-ns", + }, cm) + if err != nil { + t.Fatalf("ConfigMap not created: %v", err) + } + + if cm.Data[SignedCardConfigMapKey] != string(signedCard) { + t.Errorf("ConfigMap data mismatch: got %q", cm.Data[SignedCardConfigMapKey]) + } +} + +func TestWriter_UpdateConfigMap(t *testing.T) { + scheme := testScheme() + existing := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-agent-card-signed", + Namespace: "test-ns", + }, + Data: map[string]string{SignedCardConfigMapKey: `{"name":"old"}`}, + } + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() + writer := NewWriter(client, client, scheme) + + owner := testAgentCard() + newCard := []byte(`{"name":"updated"}`) + + err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", newCard) + if err != nil { + t.Fatalf("WriteSignedCard update failed: %v", err) + } + + cm := &corev1.ConfigMap{} + if err = client.Get(context.Background(), types.NamespacedName{ + Name: "my-agent-card-signed", + Namespace: "test-ns", + }, cm); err != nil { + t.Fatalf("ConfigMap not found after update: %v", err) + } + + if cm.Data[SignedCardConfigMapKey] != string(newCard) { + t.Errorf("ConfigMap not updated: got %q", cm.Data[SignedCardConfigMapKey]) + } +} + +func TestWriter_OwnerReference(t *testing.T) { + scheme := testScheme() + client := fake.NewClientBuilder().WithScheme(scheme).Build() + writer := NewWriter(client, client, scheme) + + owner := testAgentCard() + err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", []byte(`{}`)) + if err != nil { + t.Fatalf("WriteSignedCard failed: %v", err) + } + + cm := &corev1.ConfigMap{} + if err = client.Get(context.Background(), types.NamespacedName{ + Name: "my-agent-card-signed", + Namespace: "test-ns", + }, cm); err != nil { + t.Fatalf("ConfigMap not found: %v", err) + } + + if len(cm.OwnerReferences) == 0 { + t.Error("expected owner reference on ConfigMap") + } else { + ownerRef := cm.OwnerReferences[0] + if ownerRef.Name != "test-card" { + t.Errorf("expected owner name 'test-card', got %q", ownerRef.Name) + } + if ownerRef.UID != "test-uid-123" { + t.Errorf("expected owner UID 'test-uid-123', got %q", ownerRef.UID) + } + } +} From 59de614b3c16ff159cfdd8cd18c74b6327baf69c Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Tue, 14 Apr 2026 15:31:37 +0100 Subject: [PATCH 03/16] feat(operator): enable operator-side AgentCard signing with SPIFFE identity Wire the SpiffeSigner and CardWriter into the AgentCard reconciler behind the --enable-operator-signing flag (off by default). When enabled, the operator signs cards with its own SVID, writes signed ConfigMaps, and heals tampered signatures on reconcile. Add Helm values, SPIRE CSI volume mount, ClusterSPIFFEID template, and expanded ConfigMap RBAC. Signed-off-by: Kevin Cogan Signed-off-by: Kevin Cogan --- .../templates/manager/clusterspiffeid.yaml | 17 ++ .../templates/manager/manager.yaml | 15 ++ .../kagenti-operator/templates/rbac/role.yaml | 10 + charts/kagenti-operator/values.yaml | 6 + kagenti-operator/cmd/main.go | 34 ++- .../controller/agentcard_controller.go | 54 ++++- .../controller/agentcard_controller_test.go | 225 ++++++++++++++++++ 7 files changed, 354 insertions(+), 7 deletions(-) create mode 100644 charts/kagenti-operator/templates/manager/clusterspiffeid.yaml diff --git a/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml b/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml new file mode 100644 index 00000000..c4e38e76 --- /dev/null +++ b/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml @@ -0,0 +1,17 @@ +{{- if .Values.operatorSigning.enabled }} +apiVersion: spire.spiffe.io/v1alpha1 +kind: ClusterSPIFFEID +metadata: + name: kagenti-operator + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + spiffeIDTemplate: "spiffe://{{ "{{ .TrustDomain }}" }}/ns/{{ "{{ .PodMeta.Namespace }}" }}/sa/{{ "{{ .PodSpec.ServiceAccountName }}" }}" + podSelector: + matchLabels: + app.kubernetes.io/name: {{ include "chart.name" . }} + control-plane: controller-manager + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ .Release.Namespace }} +{{- end }} diff --git a/charts/kagenti-operator/templates/manager/manager.yaml b/charts/kagenti-operator/templates/manager/manager.yaml index 787b1b27..f9483868 100644 --- a/charts/kagenti-operator/templates/manager/manager.yaml +++ b/charts/kagenti-operator/templates/manager/manager.yaml @@ -34,6 +34,10 @@ spec: {{- if .Values.mlflow.enable }} - "--enable-mlflow=true" {{- end }} + {{- if .Values.operatorSigning.enabled }} + - "--enable-operator-signing=true" + - "--spiffe-endpoint-socket={{ .Values.operatorSigning.spiffeEndpointSocket }}" + {{- end }} {{- if .Values.signatureVerification.enabled }} - "--require-a2a-signature=true" {{- if .Values.signatureVerification.auditMode }} @@ -105,6 +109,11 @@ spec: mountPath: /tmp/k8s-metrics-server/metrics-certs readOnly: true {{- end }} + {{- if .Values.operatorSigning.enabled }} + - name: spiffe-workload-api + mountPath: /spiffe-workload-api + readOnly: true + {{- end }} securityContext: {{- toYaml .Values.controllerManager.securityContext | nindent 8 }} serviceAccountName: {{ .Values.controllerManager.serviceAccountName }} @@ -128,3 +137,9 @@ spec: secret: secretName: kagenti-operator-metrics-server-cert {{- end }} + {{- if .Values.operatorSigning.enabled }} + - name: spiffe-workload-api + csi: + driver: "csi.spiffe.io" + readOnly: true + {{- end }} diff --git a/charts/kagenti-operator/templates/rbac/role.yaml b/charts/kagenti-operator/templates/rbac/role.yaml index 084b683b..edb39b7e 100755 --- a/charts/kagenti-operator/templates/rbac/role.yaml +++ b/charts/kagenti-operator/templates/rbac/role.yaml @@ -9,6 +9,16 @@ rules: - "" resources: - configmaps + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: - namespaces - services verbs: diff --git a/charts/kagenti-operator/values.yaml b/charts/kagenti-operator/values.yaml index 3052922e..539db51c 100644 --- a/charts/kagenti-operator/values.yaml +++ b/charts/kagenti-operator/values.yaml @@ -92,6 +92,12 @@ networkPolicy: mlflow: enable: false +# [OPERATOR SIGNING]: Operator signs agent cards with its own SPIFFE identity +# Replaces the init-container self-signing model (Phase 2A of Sigstore RFC) +operatorSigning: + enabled: false + spiffeEndpointSocket: "unix:///spiffe-workload-api/spire-agent.sock" + # [SIGNATURE VERIFICATION]: A2A agent card signature verification via SPIRE x5c signatureVerification: # Enable signature verification for agent cards diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index fded58d5..45cc30d3 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -88,6 +88,9 @@ func main() { var enableOperatorClientRegistration bool var enableMLflow bool + var enableOperatorSigning bool + var spiffeEndpointSocket string + var spireTrustDomain string var spireTrustBundleConfigMapName string var spireTrustBundleConfigMapNS string @@ -129,6 +132,11 @@ func main() { flag.BoolVar(&enableMLflow, "enable-mlflow", false, "Enable MLflow experiment tracking integration") + flag.BoolVar(&enableOperatorSigning, "enable-operator-signing", false, + "Operator signs agent cards with its own SPIFFE identity instead of the init-container") + flag.StringVar(&spiffeEndpointSocket, "spiffe-endpoint-socket", "unix:///spiffe-workload-api/spire-agent.sock", + "SPIFFE Workload API socket path for operator signing") + flag.StringVar(&spireTrustDomain, "spire-trust-domain", "", "SPIRE trust domain for identity binding (e.g. 'example.org')") flag.StringVar(&spireTrustBundleConfigMapName, "spire-trust-bundle-configmap", "", @@ -307,12 +315,14 @@ func main() { var sigProvider signature.Provider if requireA2ASignature { if spireTrustDomain == "" { - setupLog.Error(errors.New("missing required flag"), "--spire-trust-domain is required when --require-a2a-signature=true") + setupLog.Error(errors.New("missing required flag"), + "--spire-trust-domain is required when --require-a2a-signature=true") os.Exit(1) } if spireTrustBundleConfigMapName == "" || spireTrustBundleConfigMapNS == "" { setupLog.Error(errors.New("missing required flags"), - "--spire-trust-bundle-configmap and --spire-trust-bundle-configmap-namespace are required when --require-a2a-signature=true") + "--spire-trust-bundle-configmap and --spire-trust-bundle-configmap-namespace "+ + "are required when --require-a2a-signature=true") os.Exit(1) } @@ -337,7 +347,7 @@ func main() { "auditMode", signatureAuditMode) } - if err = (&controller.AgentCardReconciler{ + agentCardReconciler := &controller.AgentCardReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("agentcard-controller"), @@ -347,7 +357,23 @@ func main() { SignatureAuditMode: signatureAuditMode, SpireTrustDomain: spireTrustDomain, SVIDExpiryGracePeriod: svidExpiryGracePeriod, - }).SetupWithManager(mgr); err != nil { + EnableOperatorSign: enableOperatorSigning, + } + + if enableOperatorSigning { + cardSigner, signerErr := signature.NewSpiffeSigner(ctx, spiffeEndpointSocket) + if signerErr != nil { + setupLog.Error(signerErr, "unable to create SPIFFE signer for operator signing") + os.Exit(1) + } + defer cardSigner.Close() //nolint:errcheck // best-effort cleanup on shutdown + agentCardReconciler.CardSigner = cardSigner + agentCardReconciler.CardWriter = agentcard.NewWriter(mgr.GetClient(), mgr.GetAPIReader(), mgr.GetScheme()) + setupLog.Info("Operator AgentCard signing enabled", + "spiffeEndpointSocket", spiffeEndpointSocket) + } + + if err = agentCardReconciler.SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "AgentCard") os.Exit(1) } diff --git a/kagenti-operator/internal/controller/agentcard_controller.go b/kagenti-operator/internal/controller/agentcard_controller.go index 63b2a12a..2e73a22f 100644 --- a/kagenti-operator/internal/controller/agentcard_controller.go +++ b/kagenti-operator/internal/controller/agentcard_controller.go @@ -126,6 +126,16 @@ type AgentCardReconciler struct { // SVIDExpiryGracePeriod controls how far before the leaf cert expires the operator // triggers a proactive workload restart. Defaults to DefaultSVIDExpiryGracePeriod. SVIDExpiryGracePeriod time.Duration + + // CardSigner signs agent cards with the operator's SPIFFE identity. + // Nil when operator signing is disabled. + CardSigner signature.Signer + + // EnableOperatorSign gates the operator signing code path. + EnableOperatorSign bool + + // CardWriter writes signed cards to ConfigMaps. Nil when operator signing is disabled. + CardWriter *agentcard.Writer } // +kubebuilder:rbac:groups=agent.kagenti.dev,resources=agentcards,verbs=get;list;watch;create;update;patch;delete @@ -134,7 +144,7 @@ type AgentCardReconciler struct { // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;update;patch // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;update;patch -// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch +// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch func (r *AgentCardReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { agentCardLogger.V(1).Info("Reconciling AgentCard", "namespacedName", req.NamespacedName) @@ -229,6 +239,39 @@ func (r *AgentCardReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{RequeueAfter: 1 * time.Minute}, nil } + if r.EnableOperatorSign && r.CardSigner != nil { + if !r.CardSigner.Ready() { + agentCardLogger.Info("Operator signer not ready, requeueing", "workload", workload.Name) + if condErr := r.updateCondition(ctx, agentCard, "Synced", metav1.ConditionFalse, "SignerNotReady", + "Operator SPIFFE signer has not yet obtained an SVID"); condErr != nil { + return ctrl.Result{}, condErr + } + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + + signedBytes, signErr := r.CardSigner.SignCard(ctx, cardData) + if signErr != nil { + agentCardLogger.Error(signErr, "Operator signing failed", "workload", workload.Name) + if condErr := r.updateCondition(ctx, agentCard, "Synced", metav1.ConditionFalse, "SigningFailed", signErr.Error()); condErr != nil { + return ctrl.Result{}, condErr + } + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + + if r.CardWriter != nil { + if writeErr := r.CardWriter.WriteSignedCard(ctx, agentCard, workload.ServiceName, agentCard.Namespace, signedBytes); writeErr != nil { + agentCardLogger.Error(writeErr, "Failed to write signed card ConfigMap (non-fatal)", "workload", workload.Name) + if r.Recorder != nil { + r.Recorder.Event(agentCard, corev1.EventTypeWarning, "ConfigMapWriteFailed", writeErr.Error()) + } + } + } + + agentCardLogger.Info("Operator signed agent card", + "workload", workload.Name, + "spiffeID", r.CardSigner.SpiffeID()) + } + var verificationResult *signature.VerificationResult if r.RequireSignature { var verifyErr error @@ -1108,9 +1151,14 @@ func (r *AgentCardReconciler) computeCardId(cardData *agentv1alpha1.AgentCardDat // 1. The leaf SVID cert is approaching expiry (within SVIDExpiryGracePeriod). // 2. The trust bundle hash changed since the workload was last (re)started. // -// Both feed into the same mechanism: patch the pod template annotation to trigger a rollout. -// The init-container re-runs, fetches a fresh SVID, and re-signs the card. +// When operator signing is enabled, the operator re-signs on each reconcile using its +// auto-rotating X509Source, so pod restarts for re-signing are unnecessary. +// In legacy init-container mode, the pod template annotation is patched to trigger +// a rollout so the init-container re-runs and fetches a fresh SVID. func (r *AgentCardReconciler) maybeRestartForResign(ctx context.Context, agentCard *agentv1alpha1.AgentCard, workload *WorkloadInfo, vr *signature.VerificationResult) { + if r.EnableOperatorSign { + return + } if workload == nil || r.SignatureProvider == nil { return } diff --git a/kagenti-operator/internal/controller/agentcard_controller_test.go b/kagenti-operator/internal/controller/agentcard_controller_test.go index db120deb..4d0a0d87 100644 --- a/kagenti-operator/internal/controller/agentcard_controller_test.go +++ b/kagenti-operator/internal/controller/agentcard_controller_test.go @@ -36,6 +36,44 @@ import ( "github.com/kagenti/operator/internal/agentcard" ) +// mockSigner implements signature.Signer for testing operator signing path. +type mockSigner struct { + mu sync.Mutex + signErr error + signedOut []byte + ready bool + spiffeID string + signCount int +} + +func (m *mockSigner) SignCard(_ context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) { + m.mu.Lock() + defer m.mu.Unlock() + m.signCount++ + if m.signErr != nil { + return nil, m.signErr + } + if m.signedOut != nil { + return m.signedOut, nil + } + out, _ := json.Marshal(cardData) + return out, nil +} + +func (m *mockSigner) SpiffeID() string { + m.mu.Lock() + defer m.mu.Unlock() + return m.spiffeID +} + +func (m *mockSigner) Ready() bool { + m.mu.Lock() + defer m.mu.Unlock() + return m.ready +} + +func (m *mockSigner) Close() error { return nil } + // mockFetcher implements agentcard.Fetcher for testing type mockFetcher struct { cardData *agentv1alpha1.AgentCardData @@ -1724,6 +1762,193 @@ var _ = Describe("AgentCard Controller - getSyncPeriod", func() { }) }) +var _ = Describe("AgentCard Operator Signing", func() { + const ( + opsDeployName = "opsign-agent" + opsCardName = "opsign-card" + opsSvcName = "opsign-agent" + namespace = "default" + ) + + ctx := context.Background() + nn := types.NamespacedName{Name: opsCardName, Namespace: namespace} + + BeforeEach(func() { + By("creating a Deployment with agent labels") + replicas := int32(1) + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: opsDeployName, + Namespace: namespace, + Labels: map[string]string{ + LabelAgentType: LabelValueAgent, + ProtocolLabelPrefix + "a2a": "", + }, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": opsDeployName}}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{ + "app": opsDeployName, + LabelAgentType: LabelValueAgent, + ProtocolLabelPrefix + "a2a": "", + }}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "agent", Image: "img:latest"}}}, + }, + }, + } + Expect(k8sClient.Create(ctx, dep)).To(Succeed()) + + By("setting Deployment status to Available") + Eventually(func() error { + if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsDeployName, Namespace: namespace}, dep); err != nil { + return err + } + dep.Status.Conditions = []appsv1.DeploymentCondition{ + {Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue}, + } + return k8sClient.Status().Update(ctx, dep) + }).Should(Succeed()) + + By("creating a Service for the Deployment") + svc := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{Name: opsSvcName, Namespace: namespace}, + Spec: corev1.ServiceSpec{Ports: []corev1.ServicePort{{Port: 8000}}}, + } + Expect(k8sClient.Create(ctx, svc)).To(Succeed()) + + By("creating an AgentCard with targetRef") + card := &agentv1alpha1.AgentCard{ + ObjectMeta: metav1.ObjectMeta{Name: opsCardName, Namespace: namespace}, + Spec: agentv1alpha1.AgentCardSpec{ + SyncPeriod: "30s", + TargetRef: &agentv1alpha1.TargetRef{APIVersion: "apps/v1", Kind: "Deployment", Name: opsDeployName}, + }, + } + Expect(k8sClient.Create(ctx, card)).To(Succeed()) + }) + + AfterEach(func() { + By("cleaning up resources") + ac := &agentv1alpha1.AgentCard{} + if err := k8sClient.Get(ctx, nn, ac); err == nil { + ac.Finalizers = nil + _ = k8sClient.Update(ctx, ac) + _ = k8sClient.Delete(ctx, ac) + } + dep := &appsv1.Deployment{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsDeployName, Namespace: namespace}, dep); err == nil { + Expect(k8sClient.Delete(ctx, dep)).To(Succeed()) + } + svc := &corev1.Service{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsSvcName, Namespace: namespace}, svc); err == nil { + Expect(k8sClient.Delete(ctx, svc)).To(Succeed()) + } + }) + + It("should skip signing when operator signing disabled", func() { + signer := &mockSigner{ready: true, spiffeID: "spiffe://example.org/operator"} + + reconciler := &AgentCardReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, + EnableOperatorSign: false, + CardSigner: signer, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + signer.mu.Lock() + count := signer.signCount + signer.mu.Unlock() + Expect(count).To(Equal(0), "signer should not be called when operator signing disabled") + }) + + It("should sign card when operator signing enabled", func() { + signer := &mockSigner{ + ready: true, + spiffeID: "spiffe://example.org/kagenti-operator", + } + + reconciler := &AgentCardReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "OpSign", Version: "1", URL: "http://x"}}, + EnableOperatorSign: true, + CardSigner: signer, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + + signer.mu.Lock() + count := signer.signCount + signer.mu.Unlock() + Expect(count).To(Equal(1), "signer should be called once") + }) + + It("should requeue when signer not ready", func() { + signer := &mockSigner{ + ready: false, + spiffeID: "", + } + + reconciler := &AgentCardReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, + EnableOperatorSign: true, + CardSigner: signer, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(10 * time.Second)) + + card := &agentv1alpha1.AgentCard{} + Expect(k8sClient.Get(ctx, nn, card)).To(Succeed()) + syncCond := findCondition(card.Status.Conditions, "Synced") + Expect(syncCond).NotTo(BeNil()) + Expect(syncCond.Reason).To(Equal("SignerNotReady")) + }) + + It("should handle signing failure gracefully", func() { + signer := &mockSigner{ + ready: true, + signErr: errors.New("SVID expired"), + spiffeID: "spiffe://example.org/operator", + } + + reconciler := &AgentCardReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, + EnableOperatorSign: true, + CardSigner: signer, + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) + Expect(err).NotTo(HaveOccurred()) + Expect(result.RequeueAfter).To(Equal(30 * time.Second)) + + card := &agentv1alpha1.AgentCard{} + Expect(k8sClient.Get(ctx, nn, card)).To(Succeed()) + syncCond := findCondition(card.Status.Conditions, "Synced") + Expect(syncCond).NotTo(BeNil()) + Expect(syncCond.Reason).To(Equal("SigningFailed")) + }) + +}) + // Helper function to find a condition by type func findCondition(conditions []metav1.Condition, conditionType string) *metav1.Condition { for i := range conditions { From 23f13d663c379f7c803f6c9a740b9bcc29080a9a Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Fri, 17 Apr 2026 10:33:30 +0100 Subject: [PATCH 04/16] feat(agentcard): enforce strict mode in NetworkPolicy controller Skip NetworkPolicy creation when identity binding fails and strict mode is disabled, allowing agents with unverified bindings to retain network access. Add test coverage for strict vs non-strict behavior. Signed-off-by: Kevin Cogan --- .../agentcard_networkpolicy_controller.go | 13 ++++++++----- .../agentcard_networkpolicy_controller_test.go | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go index cfd0408c..32f1464c 100644 --- a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go +++ b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go @@ -49,7 +49,7 @@ type AgentCardNetworkPolicyReconciler struct { EnforceNetworkPolicies bool // KubeAPIServerCIDRs are the /32 CIDRs of the K8s API server endpoints. // Populated at startup from the "kubernetes" Endpoints in the default namespace. - // Used to allow init-container egress to the API server in restrictive policies. + // Used to allow workload egress to the API server in restrictive policies. KubeAPIServerCIDRs []string } @@ -143,6 +143,9 @@ func (r *AgentCardNetworkPolicyReconciler) manageNetworkPolicy(ctx context.Conte isVerified := false if agentCard.Spec.IdentityBinding != nil { isVerified = agentCard.Status.SignatureIdentityMatch != nil && *agentCard.Status.SignatureIdentityMatch + if !isVerified && !agentCard.Spec.IdentityBinding.Strict { + isVerified = true + } } else { isVerified = agentCard.Status.ValidSignature != nil && *agentCard.Status.ValidSignature } @@ -242,10 +245,10 @@ func (r *AgentCardNetworkPolicyReconciler) createRestrictivePolicy( } // kubeAPIEgressRules returns egress rules that allow only traffic to the -// K8s API server endpoints on port 6443. This permits init containers -// (e.g. agentcard-signer) to write ConfigMaps while blocking all other -// outbound traffic. If no API server CIDRs are configured, returns an -// empty list (deny-all egress). +// K8s API server endpoints on port 6443. This permits workloads to +// communicate with the API server while blocking all other outbound +// traffic. If no API server CIDRs are configured, returns an empty +// list (deny-all egress). func (r *AgentCardNetworkPolicyReconciler) kubeAPIEgressRules() []netv1.NetworkPolicyEgressRule { if len(r.KubeAPIServerCIDRs) == 0 { return []netv1.NetworkPolicyEgressRule{} diff --git a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go index 57d9786b..e2bbd5b5 100644 --- a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go +++ b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go @@ -223,9 +223,10 @@ var _ = Describe("AgentCardNetworkPolicyReconciler", func() { Expect(policyHasVerifiedPodIngress(p)).To(BeTrue()) }) - It("should create restrictive policy when SignatureIdentityMatch=false", func() { + It("should create restrictive policy when SignatureIdentityMatch=false and strict=true", func() { + strictBinding := &agentv1alpha1.IdentityBinding{TrustDomain: "test.local", Strict: true} createDeploymentWithService(ctx, deploymentName, namespace) - createCardWithStatus(agentCardName, namespace, deploymentName, nil, ptr.To(false), binding) + createCardWithStatus(agentCardName, namespace, deploymentName, nil, ptr.To(false), strictBinding) r := newNPReconciler(true) reconcileNP(r, agentCardName, namespace) @@ -234,6 +235,19 @@ var _ = Describe("AgentCardNetworkPolicyReconciler", func() { p := getPolicy(deploymentName, namespace) Expect(p.Spec.Ingress[0].From[0].PodSelector).To(BeNil()) }) + + It("should create permissive policy when SignatureIdentityMatch=false and strict=false", func() { + createDeploymentWithService(ctx, deploymentName, namespace) + createCardWithStatus(agentCardName, namespace, deploymentName, nil, ptr.To(false), binding) + + r := newNPReconciler(true) + reconcileNP(r, agentCardName, namespace) + reconcileNP(r, agentCardName, namespace) + + p := getPolicy(deploymentName, namespace) + Expect(p.Spec.Ingress[0].From).To(HaveLen(3)) + Expect(policyHasVerifiedPodIngress(p)).To(BeTrue()) + }) }) Context("Policy updates when verification status changes", func() { From ac0c2eed3aea44a423c1dfe7de6691f9cb8c8f7b Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Fri, 17 Apr 2026 10:33:58 +0100 Subject: [PATCH 05/16] chore: regenerate CRD, RBAC, and webhook manifests Run make manifests to reflect the new Strict field in the AgentCard CRD, updated controller RBAC markers, and webhook timeout removal. Signed-off-by: Kevin Cogan --- .../bases/agent.kagenti.dev_agentcards.yaml | 7 +++-- kagenti-operator/config/rbac/role.yaml | 28 +++++++++---------- .../config/webhook/manifests.yaml | 23 --------------- 3 files changed, 19 insertions(+), 39 deletions(-) diff --git a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml index 11f366d7..21d35447 100644 --- a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml +++ b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml @@ -84,8 +84,11 @@ spec: strict: default: false description: |- - Strict enables enforcement mode: binding failures trigger network isolation. - When false (default), results are recorded in status only (audit mode). + Strict controls whether binding failures trigger enforcement actions + (label removal, restrictive NetworkPolicy). + When true, binding failure removes the verified label and applies restrictive NetworkPolicy. + When false (default), binding results are recorded in status only; + the workload retains its verified label and permissive policy. type: boolean trustDomain: description: |- diff --git a/kagenti-operator/config/rbac/role.yaml b/kagenti-operator/config/rbac/role.yaml index 596cf0e3..435f43fd 100644 --- a/kagenti-operator/config/rbac/role.yaml +++ b/kagenti-operator/config/rbac/role.yaml @@ -8,11 +8,13 @@ rules: - "" resources: - configmaps - - namespaces - - services + - secrets verbs: + - create - get - list + - patch + - update - watch - apiGroups: - "" @@ -24,19 +26,17 @@ rules: - apiGroups: - "" resources: - - pods + - namespaces + - services verbs: - get - list - - patch - - update - watch - apiGroups: - "" resources: - - secrets + - pods verbs: - - create - get - list - patch @@ -90,6 +90,13 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - deployments/finalizers + - statefulsets/finalizers + verbs: + - update - apiGroups: - mlflow.opendatahub.io resources: @@ -98,13 +105,6 @@ rules: - get - list - watch -- apiGroups: - - apps - resources: - - deployments/finalizers - - statefulsets/finalizers - verbs: - - update - apiGroups: - networking.k8s.io resources: diff --git a/kagenti-operator/config/webhook/manifests.yaml b/kagenti-operator/config/webhook/manifests.yaml index 41c04ade..434c7a5c 100644 --- a/kagenti-operator/config/webhook/manifests.yaml +++ b/kagenti-operator/config/webhook/manifests.yaml @@ -13,28 +13,6 @@ webhooks: path: /mutate-workloads-authbridge failurePolicy: Fail name: inject.kagenti.io - namespaceSelector: - matchExpressions: - - key: kubernetes.io/metadata.name - operator: NotIn - values: - - kube-system - - kube-public - - kube-node-lease - - kagenti-operator-system - matchLabels: - kagenti-enabled: "true" - objectSelector: - matchExpressions: - - key: kagenti.io/type - operator: In - values: - - agent - - tool - - key: kagenti.io/inject - operator: NotIn - values: - - disabled reinvocationPolicy: IfNeeded rules: - apiGroups: @@ -46,7 +24,6 @@ webhooks: resources: - pods sideEffects: None - timeoutSeconds: 10 --- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration From c98d034039f48408bddbe8976c2e12813418f7a0 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Fri, 17 Apr 2026 10:34:20 +0100 Subject: [PATCH 06/16] fix(helm): add RBAC for deployment and statefulset finalizers Grant update permission on deployments/finalizers and statefulsets/finalizers so the operator can set blockOwnerDeletion on owned workloads. Signed-off-by: Kevin Cogan --- charts/kagenti-operator/templates/rbac/role.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/charts/kagenti-operator/templates/rbac/role.yaml b/charts/kagenti-operator/templates/rbac/role.yaml index edb39b7e..195cabdd 100755 --- a/charts/kagenti-operator/templates/rbac/role.yaml +++ b/charts/kagenti-operator/templates/rbac/role.yaml @@ -101,6 +101,13 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - deployments/finalizers + - statefulsets/finalizers + verbs: + - update - apiGroups: - mlflow.opendatahub.io resources: From bcd55b4749718e98fcb38bf0efc95cabe57fde28 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Fri, 17 Apr 2026 10:34:46 +0100 Subject: [PATCH 07/16] docs(demos): update enforcement prereqs and add operator-signing demo Point the enforcement demo prerequisite back to agentcard-spire-signing (init-container path). Add a new agentcard-operator-signing demo with deployment manifests, ClusterSPIFFEID, AgentCard CR, and teardown script for the opt-in operator signing path. Signed-off-by: Kevin Cogan --- .../demos/agentcard-enforcement/demo.md | 14 +- .../run-demo-commands.sh | 11 +- .../demos/agentcard-operator-signing/demo.md | 129 ++++++++++++++++++ .../k8s/agent-deployment.yaml | 68 +++++++++ .../k8s/agentcard.yaml | 13 ++ .../k8s/clusterspiffeid.yaml | 12 ++ .../k8s/namespace.yaml | 6 + .../run-demo-commands.sh | 71 ++++++++++ .../teardown-demo.sh | 37 +++++ 9 files changed, 347 insertions(+), 14 deletions(-) create mode 100644 kagenti-operator/demos/agentcard-operator-signing/demo.md create mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml create mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml create mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml create mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml create mode 100755 kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh create mode 100755 kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh diff --git a/kagenti-operator/demos/agentcard-enforcement/demo.md b/kagenti-operator/demos/agentcard-enforcement/demo.md index f7c1e813..8401f754 100644 --- a/kagenti-operator/demos/agentcard-enforcement/demo.md +++ b/kagenti-operator/demos/agentcard-enforcement/demo.md @@ -13,7 +13,7 @@ This demo shows how the operator enforces identity binding through trust-domain |----------|-------------| | Wrong trust domain | Signature stays valid, but binding fails — `Bound=false` | | Binding failure (`strict: true`) | Label removed, restrictive NetworkPolicy applied | -| Binding failure (`strict: false`) | Label removed, restrictive NetworkPolicy applied | +| Binding failure (`strict: false`) | Label retained, permissive NetworkPolicy kept | | Restored trust domain | Binding passes — label restored, permissive NetworkPolicy | ## Run the Demo @@ -39,18 +39,17 @@ Expected output: Identity Match: False Reason: NotBound Label: + NetworkPolicy: weather-agent-signature-policy === 3. Wrong Trust Domain (strict: false) === Verified: True Bound: False Identity Match: False Reason: NotBound - Label: - -=== 4. NetworkPolicy After Binding Failure === + Label: true NetworkPolicy: weather-agent-signature-policy -=== 5. Restored === +=== 4. Restored === Verified: True Bound: True Identity Match: True @@ -64,8 +63,9 @@ Expected output: 1. The operator evaluates identity binding on every reconciliation 2. When `spec.identityBinding.trustDomain` is set, it overrides the operator-level `--spire-trust-domain` 3. If the SPIFFE ID from the x5c chain doesn't match the configured trust domain, binding fails -4. Binding failure always removes the `signature-verified` label from the workload -5. The NetworkPolicy controller applies a restrictive policy when the label is absent, permissive when present +4. `strict: true` — binding failure removes the `signature-verified` label and triggers restrictive NetworkPolicy +5. `strict: false` (default) — binding failure is recorded in status only; the label and permissive policy are retained +6. Status always reflects the true binding result regardless of `strict` ## Cleanup diff --git a/kagenti-operator/demos/agentcard-enforcement/run-demo-commands.sh b/kagenti-operator/demos/agentcard-enforcement/run-demo-commands.sh index b9590827..71154cbb 100755 --- a/kagenti-operator/demos/agentcard-enforcement/run-demo-commands.sh +++ b/kagenti-operator/demos/agentcard-enforcement/run-demo-commands.sh @@ -59,6 +59,7 @@ echo "Waiting for reconciliation..." sleep 20 get_status get_label +get_netpol echo "" # ── 3. Wrong trust domain with strict: false ───────────────────────────────── @@ -75,15 +76,11 @@ echo "Waiting for reconciliation..." sleep 20 get_status get_label -echo "" - -# ── 4. NetworkPolicy after binding failure ─────────────────────────────────── -echo "=== 4. NetworkPolicy After Binding Failure ===" get_netpol echo "" -# ── 5. Restore correct binding ────────────────────────────────────────────── -echo "=== 5. Restoring correct binding ===" +# ── 4. Restore correct binding ────────────────────────────────────────────── +echo "=== 4. Restoring correct binding ===" kubectl patch agentcard "$AGENTCARD" -n "$NAMESPACE" --type=json -p '[ {"op": "remove", "path": "/spec/identityBinding"} ]' @@ -97,7 +94,7 @@ kubectl patch agentcard "$AGENTCARD" -n "$NAMESPACE" --type=merge -p '{ echo "Waiting for reconciliation..." sleep 20 echo "" -echo "=== 5. Restored ===" +echo "=== 4. Restored ===" get_status get_label get_netpol diff --git a/kagenti-operator/demos/agentcard-operator-signing/demo.md b/kagenti-operator/demos/agentcard-operator-signing/demo.md new file mode 100644 index 00000000..420eb43a --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/demo.md @@ -0,0 +1,129 @@ +# Operator Signing Demo + +This demo shows automated AgentCard signing by the kagenti operator using its +own SPIFFE identity, with x5c signature verification and trust-domain identity +binding. + +## Overview + +``` +SPIRE Server + | + | issues X.509-SVID to operator + v +Operator Pod Agent Pod ++---------------------------+ +---------------------------+ +| | | | +| kagenti-operator | -- fetch --> | serves /.well-known/ | +| HTTP GET agent card | | agent-card.json | +| signs with operator | | (rich card with skills) | +| SPIFFE identity | | | +| verifies x5c chain | +---------------------------+ +| validates trust domain | +| sets Verified + Bound | +| | ++---------------------------+ +``` + +The agent serves its card at `/.well-known/agent-card.json`. The operator +fetches it via HTTP, signs it with a JWS (ES256) using the operator's own +SPIFFE SVID, verifies the signature against the SPIRE trust bundle, and +validates the signer's SPIFFE ID belongs to the configured trust domain. + +## Prerequisites + +- Kubernetes/OpenShift cluster with SPIRE installed +- `spire-controller-manager` running (for ClusterSPIFFEID support) +- SPIFFE CSI driver available (`csi.spiffe.io`) +- Trust bundle ConfigMap labeled `kagenti.io/defaults=true` +- kagenti-operator deployed with `signatureVerification.enabled=true` + +## Setup + +### 1. Deploy the Demo + +No images to build — the agent uses a standard `python:3.11-slim` image. + +```bash +kubectl apply -f demos/agentcard-operator-signing/k8s/namespace.yaml +kubectl apply -f demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml +kubectl apply -f demos/agentcard-operator-signing/k8s/agent-deployment.yaml +kubectl apply -f demos/agentcard-operator-signing/k8s/agentcard.yaml +``` + +### 2. Wait for the Agent + +```bash +kubectl wait --for=condition=available --timeout=120s \ + deployment/weather-agent -n agents +``` + +### 3. Wait for Signing + +The operator signs the card on the next reconciliation cycle (~30s): + +```bash +sleep 30 +kubectl get agentcard weather-agent-card -n agents +# Expect: VERIFIED=true, BOUND=true, SYNCED=True +``` + +## Test the Flow + +Run the demo script to see signing and verification in action: + +```bash +./demos/agentcard-operator-signing/run-demo-commands.sh +``` + +Expected output: + +``` +=== 1. Agent Card Served via HTTP === + Name: Weather Agent + Skills: ['get_weather'] + +=== 2. Operator Signing Log === + Workload: weather-agent + SPIFFE ID: spiffe:///ns//sa/controller-manager + +=== 3. JWS Protected Header === + Algorithm: ES256 + Type: JOSE + Key ID: <16-char hex> + x5c certs: 1 + +=== 4. Operator Verification Status === + SignatureVerified True (SignatureValid) + Bound True (Bound) + Synced True (SyncSucceeded) + +=== 5. Identity Binding === + SPIFFE ID: spiffe:///ns//sa/controller-manager + Identity Match: True + Bound: True + +=== 6. Signature Label === + agent.kagenti.dev/signature-verified: true + +=== 7. AgentCard Summary === +NAME PROTOCOL KIND TARGET AGENT VERIFIED BOUND SYNCED ... +weather-agent-card a2a Deployment weather-agent Weather Agent true true True ... +``` + +## How It Works + +1. The agent serves its card at `/.well-known/agent-card.json` via HTTP +2. The operator fetches the card during reconciliation +3. The operator signs the card with JWS (ES256) using its own SPIFFE SVID +4. The operator verifies the signature against the SPIRE trust bundle +5. The SPIFFE ID from the leaf certificate's SAN URI is extracted +6. If the SPIFFE ID belongs to the configured trust domain, the card is marked as Bound +7. The `agent.kagenti.dev/signature-verified` label is set on the workload +8. The signed card is stored in the AgentCard CR status + +## Cleanup + +```bash +./demos/agentcard-operator-signing/teardown-demo.sh +``` diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml new file mode 100644 index 00000000..89f6bf6f --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: weather-agent + namespace: agents + labels: + kagenti.io/type: agent + protocol.kagenti.io/a2a: "" + app.kubernetes.io/name: weather-agent +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: weather-agent + kagenti.io/type: agent + template: + metadata: + labels: + app.kubernetes.io/name: weather-agent + kagenti.io/type: agent + protocol.kagenti.io/a2a: "" + istio.io/dataplane-mode: none + spec: + containers: + - name: agent + image: docker.io/python:3.11-slim + command: + - python3 + - -c + - | + import http.server, json + class H(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/.well-known/agent-card.json': + card = {'name': 'Weather Agent', + 'description': 'Provides weather forecasts and current conditions', + 'url': 'http://weather-agent.agents.svc.cluster.local:8080', + 'version': '1.0.0', + 'capabilities': {'streaming': False, 'pushNotifications': False}, + 'defaultInputModes': ['text/plain'], + 'defaultOutputModes': ['text/plain'], + 'skills': [{'name': 'get_weather', + 'description': 'Get current weather for a location', + 'inputModes': ['text/plain'], + 'outputModes': ['text/plain']}]} + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(card).encode()) + else: + self.send_response(404) + self.end_headers() + def log_message(self, *a): pass + http.server.HTTPServer(('', 8080), H).serve_forever() + ports: + - containerPort: 8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: weather-agent + namespace: agents +spec: + selector: + app.kubernetes.io/name: weather-agent + ports: + - port: 8080 + targetPort: 8080 diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml new file mode 100644 index 00000000..1f619a06 --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml @@ -0,0 +1,13 @@ +apiVersion: agent.kagenti.dev/v1alpha1 +kind: AgentCard +metadata: + name: weather-agent-card + namespace: agents +spec: + syncPeriod: "30s" + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: weather-agent + identityBinding: + strict: true diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml new file mode 100644 index 00000000..b9e581d6 --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml @@ -0,0 +1,12 @@ +apiVersion: spire.spiffe.io/v1alpha1 +kind: ClusterSPIFFEID +metadata: + name: agents +spec: + spiffeIDTemplate: "spiffe://{{ .TrustDomain }}/ns/{{ .PodMeta.Namespace }}/sa/{{ .PodSpec.ServiceAccountName }}" + podSelector: + matchLabels: + kagenti.io/type: agent + namespaceSelector: + matchLabels: + agentcard: "true" diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml new file mode 100644 index 00000000..927e63d8 --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: agents + labels: + agentcard: "true" diff --git a/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh b/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh new file mode 100755 index 00000000..5c5b87b8 --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +## +# Run verification commands for the operator signing demo. +# Assumes setup is complete (see demo.md). +# + +set -euo pipefail + +NAMESPACE="${NAMESPACE:-agents}" +OPERATOR_NS="${OPERATOR_NS:-kagenti-system}" +OPERATOR_DEPLOY="${OPERATOR_DEPLOY:-kagenti-controller-manager}" + +echo "=== 1. Agent Card Served via HTTP ===" +POD=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=weather-agent \ + --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') +kubectl exec -n "$NAMESPACE" "$POD" -c agent -- \ + python3 -c "import urllib.request, json; d=json.loads(urllib.request.urlopen('http://localhost:8080/.well-known/agent-card.json').read()); print(f' Name: {d[\"name\"]}'); print(f' Skills: {[s[\"name\"] for s in d.get(\"skills\",[])]}')" +echo "" + +echo "=== 2. Operator Signing Log ===" +kubectl logs -n "$OPERATOR_NS" deployment/"$OPERATOR_DEPLOY" 2>&1 | \ + grep "Operator signed agent card" | grep "weather-agent" | tail -1 | \ + python3 -c "import sys,json; d=json.loads(sys.stdin.read()); print(f' Workload: {d[\"workload\"]}'); print(f' SPIFFE ID: {d[\"spiffeID\"]}')" 2>/dev/null || \ + echo " (no signing log found — wait for reconciliation)" +echo "" + +echo "=== 3. JWS Protected Header ===" +kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ + -o jsonpath='{.status.card.signatures[0].protected}' | python3 -c " +import sys, base64, json +b64 = sys.stdin.read().strip() +if b64: + header = json.loads(base64.urlsafe_b64decode(b64 + '==')) + print(f' Algorithm: {header.get(\"alg\")}') + print(f' Type: {header.get(\"typ\")}') + print(f' Key ID: {header.get(\"kid\")}') + print(f' x5c certs: {len(header.get(\"x5c\", []))}') +else: + print(' (no signature yet — wait for reconciliation)') +" +echo "" + +echo "=== 4. Operator Verification Status ===" +kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ + -o jsonpath='{.status.conditions}' | python3 -c " +import sys, json +for c in json.loads(sys.stdin.read()): + if c['type'] in ('SignatureVerified', 'Bound', 'Synced'): + print(f' {c[\"type\"]:20s} {c[\"status\"]:5s} ({c[\"reason\"]})') +" +echo "" + +echo "=== 5. Identity Binding ===" +kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ + -o jsonpath='{.status}' | python3 -c " +import sys, json +s = json.loads(sys.stdin.read()) +print(f' SPIFFE ID: {s.get(\"signatureSpiffeId\", \"(none)\")}') +print(f' Identity Match: {s.get(\"signatureIdentityMatch\")}') +print(f' Bound: {s.get(\"bindingStatus\", {}).get(\"bound\")}') +" +echo "" + +echo "=== 6. Signature Label ===" +LABEL=$(kubectl get deployment weather-agent -n "$NAMESPACE" \ + -o jsonpath='{.spec.template.metadata.labels.agent\.kagenti\.dev/signature-verified}') +echo " agent.kagenti.dev/signature-verified: ${LABEL:-}" +echo "" + +echo "=== 7. AgentCard Summary ===" +kubectl get agentcard -n "$NAMESPACE" diff --git a/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh b/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh new file mode 100755 index 00000000..ad36314f --- /dev/null +++ b/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# +# Teardown script for the operator signing demo. +# Deletes k8s resources created by the demo. +# + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +K8S_DIR="${SCRIPT_DIR}/k8s" + +NAMESPACE="${NAMESPACE:-agents}" + +echo "=== Operator Signing Demo Teardown ===" +echo "" + +echo "Deleting Kubernetes resources..." +kubectl delete -f "${K8S_DIR}/agentcard.yaml" --ignore-not-found=true 2>/dev/null || true +kubectl delete -f "${K8S_DIR}/agent-deployment.yaml" --ignore-not-found=true 2>/dev/null || true +kubectl delete -f "${K8S_DIR}/clusterspiffeid.yaml" --ignore-not-found=true 2>/dev/null || true +echo "Kubernetes resources deleted." +echo "" + +echo "Deleting namespace '${NAMESPACE}'..." +kubectl delete namespace "${NAMESPACE}" --wait=false --ignore-not-found=true 2>/dev/null || true + +sleep 5 +if kubectl get namespace "${NAMESPACE}" 2>/dev/null | grep -q Terminating; then + echo "Namespace stuck in Terminating — force-finalizing..." + kubectl get namespace "${NAMESPACE}" -o json | \ + python3 -c "import sys,json; ns=json.load(sys.stdin); ns['spec']['finalizers']=[]; print(json.dumps(ns))" | \ + kubectl replace --raw "/api/v1/namespaces/${NAMESPACE}/finalize" -f - >/dev/null 2>&1 || true +fi +echo "Namespace deleted." +echo "" + +echo "=== Teardown Complete ===" From 71a2235c67ab57929aa139daee1470131ab91f9f Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Fri, 17 Apr 2026 10:35:57 +0100 Subject: [PATCH 08/16] feat(agentcard): add strict mode and gate operator signing behind conditional fetcher Add IdentityBinding.Strict field (default false) for per-card binding enforcement granularity. When strict is true, a trust-domain mismatch removes the signature-verified label; when false, the mismatch is recorded in status but the label is preserved. Gate the AgentCard fetcher on --enable-operator-signing: when disabled (default), the reconciler uses ConfigMapFetcher to read init-container- signed cards; when enabled, it uses DefaultFetcher to fetch cards via HTTP for operator-side signing. Signed-off-by: Kevin Cogan --- kagenti-operator/api/v1alpha1/agentcard_types.go | 7 +++++-- kagenti-operator/cmd/main.go | 7 ++++++- .../internal/controller/agentcard_controller.go | 11 ++++++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/kagenti-operator/api/v1alpha1/agentcard_types.go b/kagenti-operator/api/v1alpha1/agentcard_types.go index a7306ea1..fb7cfc34 100644 --- a/kagenti-operator/api/v1alpha1/agentcard_types.go +++ b/kagenti-operator/api/v1alpha1/agentcard_types.go @@ -48,8 +48,11 @@ type IdentityBinding struct { // +kubebuilder:validation:Pattern=`^[a-zA-Z0-9]([a-zA-Z0-9\-\.]*[a-zA-Z0-9])?$` TrustDomain string `json:"trustDomain,omitempty"` - // Strict enables enforcement mode: binding failures trigger network isolation. - // When false (default), results are recorded in status only (audit mode). + // Strict controls whether binding failures trigger enforcement actions + // (label removal, restrictive NetworkPolicy). + // When true, binding failure removes the verified label and applies restrictive NetworkPolicy. + // When false (default), binding results are recorded in status only; + // the workload retains its verified label and permissive policy. // +optional // +kubebuilder:default=false Strict bool `json:"strict,omitempty"` diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index 45cc30d3..e194450f 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -347,11 +347,16 @@ func main() { "auditMode", signatureAuditMode) } + agentFetcher := agentcard.NewConfigMapFetcher(mgr.GetAPIReader()) + if enableOperatorSigning { + agentFetcher = agentcard.NewFetcher() + } + agentCardReconciler := &controller.AgentCardReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("agentcard-controller"), - AgentFetcher: agentcard.NewConfigMapFetcher(mgr.GetAPIReader()), + AgentFetcher: agentFetcher, SignatureProvider: sigProvider, RequireSignature: requireA2ASignature, SignatureAuditMode: signatureAuditMode, diff --git a/kagenti-operator/internal/controller/agentcard_controller.go b/kagenti-operator/internal/controller/agentcard_controller.go index 2e73a22f..81c3264c 100644 --- a/kagenti-operator/internal/controller/agentcard_controller.go +++ b/kagenti-operator/internal/controller/agentcard_controller.go @@ -348,11 +348,12 @@ func (r *AgentCardReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } - // Both signature and binding (if configured) must pass for the label. + // Signature must pass for the label. Binding must also pass when strict: true; + // when strict: false (default), binding failures are recorded in status only. if r.RequireSignature { isVerified := sigVerified if agentCard.Spec.IdentityBinding != nil { - isVerified = isVerified && bindingPassed + isVerified = isVerified && (bindingPassed || !agentCard.Spec.IdentityBinding.Strict) } if err := r.propagateSignatureLabel(ctx, agentCard.Name, workload, isVerified); err != nil { agentCardLogger.Error(err, "Failed to propagate signature label to workload", @@ -1151,9 +1152,9 @@ func (r *AgentCardReconciler) computeCardId(cardData *agentv1alpha1.AgentCardDat // 1. The leaf SVID cert is approaching expiry (within SVIDExpiryGracePeriod). // 2. The trust bundle hash changed since the workload was last (re)started. // -// When operator signing is enabled, the operator re-signs on each reconcile using its -// auto-rotating X509Source, so pod restarts for re-signing are unnecessary. -// In legacy init-container mode, the pod template annotation is patched to trigger +// When operator signing is enabled (EnableOperatorSign), the operator re-signs on each +// reconcile using its auto-rotating X509Source, so pod restarts for re-signing are +// unnecessary. In init-container mode, the pod template annotation is patched to trigger // a rollout so the init-container re-runs and fetches a fresh SVID. func (r *AgentCardReconciler) maybeRestartForResign(ctx context.Context, agentCard *agentv1alpha1.AgentCard, workload *WorkloadInfo, vr *signature.VerificationResult) { if r.EnableOperatorSign { From b0a2f25a1c38de0f401f04411e3971aae342a9f4 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 22:37:40 +0100 Subject: [PATCH 09/16] feat: remove operator-signing in favor of mTLS verified fetch Operator-signing wrote signed ConfigMaps on behalf of agents, adding complexity without meaningful trust improvement. Replace with Phase 1 mTLS verified fetch which authenticates agents directly via SPIFFE. Removes CardSigner, CardWriter, SpiffeSigner, and the operator-signing demo. Signed-off-by: Kevin Cogan --- .../demos/agentcard-operator-signing/demo.md | 129 -------------- .../k8s/agent-deployment.yaml | 68 -------- .../k8s/agentcard.yaml | 13 -- .../k8s/clusterspiffeid.yaml | 12 -- .../k8s/namespace.yaml | 6 - .../run-demo-commands.sh | 71 -------- .../teardown-demo.sh | 37 ---- kagenti-operator/internal/agentcard/writer.go | 99 ----------- .../internal/agentcard/writer_test.go | 140 --------------- kagenti-operator/internal/signature/signer.go | 12 +- .../internal/signature/spiffe_signer.go | 129 -------------- .../internal/signature/spiffe_signer_test.go | 161 ------------------ 12 files changed, 1 insertion(+), 876 deletions(-) delete mode 100644 kagenti-operator/demos/agentcard-operator-signing/demo.md delete mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml delete mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml delete mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml delete mode 100644 kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml delete mode 100755 kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh delete mode 100755 kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh delete mode 100644 kagenti-operator/internal/agentcard/writer.go delete mode 100644 kagenti-operator/internal/agentcard/writer_test.go delete mode 100644 kagenti-operator/internal/signature/spiffe_signer.go delete mode 100644 kagenti-operator/internal/signature/spiffe_signer_test.go diff --git a/kagenti-operator/demos/agentcard-operator-signing/demo.md b/kagenti-operator/demos/agentcard-operator-signing/demo.md deleted file mode 100644 index 420eb43a..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/demo.md +++ /dev/null @@ -1,129 +0,0 @@ -# Operator Signing Demo - -This demo shows automated AgentCard signing by the kagenti operator using its -own SPIFFE identity, with x5c signature verification and trust-domain identity -binding. - -## Overview - -``` -SPIRE Server - | - | issues X.509-SVID to operator - v -Operator Pod Agent Pod -+---------------------------+ +---------------------------+ -| | | | -| kagenti-operator | -- fetch --> | serves /.well-known/ | -| HTTP GET agent card | | agent-card.json | -| signs with operator | | (rich card with skills) | -| SPIFFE identity | | | -| verifies x5c chain | +---------------------------+ -| validates trust domain | -| sets Verified + Bound | -| | -+---------------------------+ -``` - -The agent serves its card at `/.well-known/agent-card.json`. The operator -fetches it via HTTP, signs it with a JWS (ES256) using the operator's own -SPIFFE SVID, verifies the signature against the SPIRE trust bundle, and -validates the signer's SPIFFE ID belongs to the configured trust domain. - -## Prerequisites - -- Kubernetes/OpenShift cluster with SPIRE installed -- `spire-controller-manager` running (for ClusterSPIFFEID support) -- SPIFFE CSI driver available (`csi.spiffe.io`) -- Trust bundle ConfigMap labeled `kagenti.io/defaults=true` -- kagenti-operator deployed with `signatureVerification.enabled=true` - -## Setup - -### 1. Deploy the Demo - -No images to build — the agent uses a standard `python:3.11-slim` image. - -```bash -kubectl apply -f demos/agentcard-operator-signing/k8s/namespace.yaml -kubectl apply -f demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml -kubectl apply -f demos/agentcard-operator-signing/k8s/agent-deployment.yaml -kubectl apply -f demos/agentcard-operator-signing/k8s/agentcard.yaml -``` - -### 2. Wait for the Agent - -```bash -kubectl wait --for=condition=available --timeout=120s \ - deployment/weather-agent -n agents -``` - -### 3. Wait for Signing - -The operator signs the card on the next reconciliation cycle (~30s): - -```bash -sleep 30 -kubectl get agentcard weather-agent-card -n agents -# Expect: VERIFIED=true, BOUND=true, SYNCED=True -``` - -## Test the Flow - -Run the demo script to see signing and verification in action: - -```bash -./demos/agentcard-operator-signing/run-demo-commands.sh -``` - -Expected output: - -``` -=== 1. Agent Card Served via HTTP === - Name: Weather Agent - Skills: ['get_weather'] - -=== 2. Operator Signing Log === - Workload: weather-agent - SPIFFE ID: spiffe:///ns//sa/controller-manager - -=== 3. JWS Protected Header === - Algorithm: ES256 - Type: JOSE - Key ID: <16-char hex> - x5c certs: 1 - -=== 4. Operator Verification Status === - SignatureVerified True (SignatureValid) - Bound True (Bound) - Synced True (SyncSucceeded) - -=== 5. Identity Binding === - SPIFFE ID: spiffe:///ns//sa/controller-manager - Identity Match: True - Bound: True - -=== 6. Signature Label === - agent.kagenti.dev/signature-verified: true - -=== 7. AgentCard Summary === -NAME PROTOCOL KIND TARGET AGENT VERIFIED BOUND SYNCED ... -weather-agent-card a2a Deployment weather-agent Weather Agent true true True ... -``` - -## How It Works - -1. The agent serves its card at `/.well-known/agent-card.json` via HTTP -2. The operator fetches the card during reconciliation -3. The operator signs the card with JWS (ES256) using its own SPIFFE SVID -4. The operator verifies the signature against the SPIRE trust bundle -5. The SPIFFE ID from the leaf certificate's SAN URI is extracted -6. If the SPIFFE ID belongs to the configured trust domain, the card is marked as Bound -7. The `agent.kagenti.dev/signature-verified` label is set on the workload -8. The signed card is stored in the AgentCard CR status - -## Cleanup - -```bash -./demos/agentcard-operator-signing/teardown-demo.sh -``` diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml deleted file mode 100644 index 89f6bf6f..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/k8s/agent-deployment.yaml +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: weather-agent - namespace: agents - labels: - kagenti.io/type: agent - protocol.kagenti.io/a2a: "" - app.kubernetes.io/name: weather-agent -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: weather-agent - kagenti.io/type: agent - template: - metadata: - labels: - app.kubernetes.io/name: weather-agent - kagenti.io/type: agent - protocol.kagenti.io/a2a: "" - istio.io/dataplane-mode: none - spec: - containers: - - name: agent - image: docker.io/python:3.11-slim - command: - - python3 - - -c - - | - import http.server, json - class H(http.server.BaseHTTPRequestHandler): - def do_GET(self): - if self.path == '/.well-known/agent-card.json': - card = {'name': 'Weather Agent', - 'description': 'Provides weather forecasts and current conditions', - 'url': 'http://weather-agent.agents.svc.cluster.local:8080', - 'version': '1.0.0', - 'capabilities': {'streaming': False, 'pushNotifications': False}, - 'defaultInputModes': ['text/plain'], - 'defaultOutputModes': ['text/plain'], - 'skills': [{'name': 'get_weather', - 'description': 'Get current weather for a location', - 'inputModes': ['text/plain'], - 'outputModes': ['text/plain']}]} - self.send_response(200) - self.send_header('Content-Type', 'application/json') - self.end_headers() - self.wfile.write(json.dumps(card).encode()) - else: - self.send_response(404) - self.end_headers() - def log_message(self, *a): pass - http.server.HTTPServer(('', 8080), H).serve_forever() - ports: - - containerPort: 8080 ---- -apiVersion: v1 -kind: Service -metadata: - name: weather-agent - namespace: agents -spec: - selector: - app.kubernetes.io/name: weather-agent - ports: - - port: 8080 - targetPort: 8080 diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml deleted file mode 100644 index 1f619a06..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/k8s/agentcard.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: agent.kagenti.dev/v1alpha1 -kind: AgentCard -metadata: - name: weather-agent-card - namespace: agents -spec: - syncPeriod: "30s" - targetRef: - apiVersion: apps/v1 - kind: Deployment - name: weather-agent - identityBinding: - strict: true diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml deleted file mode 100644 index b9e581d6..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/k8s/clusterspiffeid.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: spire.spiffe.io/v1alpha1 -kind: ClusterSPIFFEID -metadata: - name: agents -spec: - spiffeIDTemplate: "spiffe://{{ .TrustDomain }}/ns/{{ .PodMeta.Namespace }}/sa/{{ .PodSpec.ServiceAccountName }}" - podSelector: - matchLabels: - kagenti.io/type: agent - namespaceSelector: - matchLabels: - agentcard: "true" diff --git a/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml b/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml deleted file mode 100644 index 927e63d8..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/k8s/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: agents - labels: - agentcard: "true" diff --git a/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh b/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh deleted file mode 100755 index 5c5b87b8..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/run-demo-commands.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env bash -## -# Run verification commands for the operator signing demo. -# Assumes setup is complete (see demo.md). -# - -set -euo pipefail - -NAMESPACE="${NAMESPACE:-agents}" -OPERATOR_NS="${OPERATOR_NS:-kagenti-system}" -OPERATOR_DEPLOY="${OPERATOR_DEPLOY:-kagenti-controller-manager}" - -echo "=== 1. Agent Card Served via HTTP ===" -POD=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/name=weather-agent \ - --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') -kubectl exec -n "$NAMESPACE" "$POD" -c agent -- \ - python3 -c "import urllib.request, json; d=json.loads(urllib.request.urlopen('http://localhost:8080/.well-known/agent-card.json').read()); print(f' Name: {d[\"name\"]}'); print(f' Skills: {[s[\"name\"] for s in d.get(\"skills\",[])]}')" -echo "" - -echo "=== 2. Operator Signing Log ===" -kubectl logs -n "$OPERATOR_NS" deployment/"$OPERATOR_DEPLOY" 2>&1 | \ - grep "Operator signed agent card" | grep "weather-agent" | tail -1 | \ - python3 -c "import sys,json; d=json.loads(sys.stdin.read()); print(f' Workload: {d[\"workload\"]}'); print(f' SPIFFE ID: {d[\"spiffeID\"]}')" 2>/dev/null || \ - echo " (no signing log found — wait for reconciliation)" -echo "" - -echo "=== 3. JWS Protected Header ===" -kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ - -o jsonpath='{.status.card.signatures[0].protected}' | python3 -c " -import sys, base64, json -b64 = sys.stdin.read().strip() -if b64: - header = json.loads(base64.urlsafe_b64decode(b64 + '==')) - print(f' Algorithm: {header.get(\"alg\")}') - print(f' Type: {header.get(\"typ\")}') - print(f' Key ID: {header.get(\"kid\")}') - print(f' x5c certs: {len(header.get(\"x5c\", []))}') -else: - print(' (no signature yet — wait for reconciliation)') -" -echo "" - -echo "=== 4. Operator Verification Status ===" -kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ - -o jsonpath='{.status.conditions}' | python3 -c " -import sys, json -for c in json.loads(sys.stdin.read()): - if c['type'] in ('SignatureVerified', 'Bound', 'Synced'): - print(f' {c[\"type\"]:20s} {c[\"status\"]:5s} ({c[\"reason\"]})') -" -echo "" - -echo "=== 5. Identity Binding ===" -kubectl get agentcard weather-agent-card -n "$NAMESPACE" \ - -o jsonpath='{.status}' | python3 -c " -import sys, json -s = json.loads(sys.stdin.read()) -print(f' SPIFFE ID: {s.get(\"signatureSpiffeId\", \"(none)\")}') -print(f' Identity Match: {s.get(\"signatureIdentityMatch\")}') -print(f' Bound: {s.get(\"bindingStatus\", {}).get(\"bound\")}') -" -echo "" - -echo "=== 6. Signature Label ===" -LABEL=$(kubectl get deployment weather-agent -n "$NAMESPACE" \ - -o jsonpath='{.spec.template.metadata.labels.agent\.kagenti\.dev/signature-verified}') -echo " agent.kagenti.dev/signature-verified: ${LABEL:-}" -echo "" - -echo "=== 7. AgentCard Summary ===" -kubectl get agentcard -n "$NAMESPACE" diff --git a/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh b/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh deleted file mode 100755 index ad36314f..00000000 --- a/kagenti-operator/demos/agentcard-operator-signing/teardown-demo.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -# -# Teardown script for the operator signing demo. -# Deletes k8s resources created by the demo. -# - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -K8S_DIR="${SCRIPT_DIR}/k8s" - -NAMESPACE="${NAMESPACE:-agents}" - -echo "=== Operator Signing Demo Teardown ===" -echo "" - -echo "Deleting Kubernetes resources..." -kubectl delete -f "${K8S_DIR}/agentcard.yaml" --ignore-not-found=true 2>/dev/null || true -kubectl delete -f "${K8S_DIR}/agent-deployment.yaml" --ignore-not-found=true 2>/dev/null || true -kubectl delete -f "${K8S_DIR}/clusterspiffeid.yaml" --ignore-not-found=true 2>/dev/null || true -echo "Kubernetes resources deleted." -echo "" - -echo "Deleting namespace '${NAMESPACE}'..." -kubectl delete namespace "${NAMESPACE}" --wait=false --ignore-not-found=true 2>/dev/null || true - -sleep 5 -if kubectl get namespace "${NAMESPACE}" 2>/dev/null | grep -q Terminating; then - echo "Namespace stuck in Terminating — force-finalizing..." - kubectl get namespace "${NAMESPACE}" -o json | \ - python3 -c "import sys,json; ns=json.load(sys.stdin); ns['spec']['finalizers']=[]; print(json.dumps(ns))" | \ - kubectl replace --raw "/api/v1/namespaces/${NAMESPACE}/finalize" -f - >/dev/null 2>&1 || true -fi -echo "Namespace deleted." -echo "" - -echo "=== Teardown Complete ===" diff --git a/kagenti-operator/internal/agentcard/writer.go b/kagenti-operator/internal/agentcard/writer.go deleted file mode 100644 index 3e93c4cf..00000000 --- a/kagenti-operator/internal/agentcard/writer.go +++ /dev/null @@ -1,99 +0,0 @@ -/* -Copyright 2025. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package agentcard - -import ( - "context" - "fmt" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - - agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" -) - -var writerLogger = ctrl.Log.WithName("agentcard").WithName("writer") - -// Writer creates or updates signed AgentCard ConfigMaps. -type Writer struct { - client client.Client - reader client.Reader - scheme *runtime.Scheme -} - -// NewWriter creates a Writer for signed AgentCard ConfigMaps. -// The reader should be an uncached API reader (mgr.GetAPIReader()) to avoid -// cache-scoping issues when the ConfigMap is in a different namespace. -func NewWriter(c client.Client, reader client.Reader, scheme *runtime.Scheme) *Writer { - return &Writer{client: c, reader: reader, scheme: scheme} -} - -// ConfigMapName returns the canonical ConfigMap name for a signed agent card. -func ConfigMapName(agentName string) string { - return agentName + SignedCardConfigMapSuffix -} - -// WriteSignedCard creates or updates the signed card ConfigMap, setting an -// owner reference to the AgentCard CR for automatic garbage collection. -func (w *Writer) WriteSignedCard(ctx context.Context, owner *agentv1alpha1.AgentCard, agentName, namespace string, signedCard []byte) error { - cmName := ConfigMapName(agentName) - - cm := &corev1.ConfigMap{} - key := types.NamespacedName{Name: cmName, Namespace: namespace} - err := w.reader.Get(ctx, key, cm) - - if apierrors.IsNotFound(err) { - cm = &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: cmName, - Namespace: namespace, - }, - Data: map[string]string{ - SignedCardConfigMapKey: string(signedCard), - }, - } - if err := controllerutil.SetControllerReference(owner, cm, w.scheme); err != nil { - return fmt.Errorf("failed to set owner reference on ConfigMap %s: %w", cmName, err) - } - if err := w.client.Create(ctx, cm); err != nil { - return fmt.Errorf("failed to create ConfigMap %s: %w", cmName, err) - } - writerLogger.Info("Created signed card ConfigMap", "configMap", cmName, "namespace", namespace) - return nil - } else if err != nil { - return fmt.Errorf("failed to get ConfigMap %s: %w", cmName, err) - } - - cm.Data = map[string]string{ - SignedCardConfigMapKey: string(signedCard), - } - if err := controllerutil.SetControllerReference(owner, cm, w.scheme); err != nil { - writerLogger.V(1).Info("Could not set owner reference on existing ConfigMap (may already be owned)", - "configMap", cmName, "error", err.Error()) - } - if err := w.client.Update(ctx, cm); err != nil { - return fmt.Errorf("failed to update ConfigMap %s: %w", cmName, err) - } - writerLogger.Info("Updated signed card ConfigMap", "configMap", cmName, "namespace", namespace) - return nil -} diff --git a/kagenti-operator/internal/agentcard/writer_test.go b/kagenti-operator/internal/agentcard/writer_test.go deleted file mode 100644 index f787b514..00000000 --- a/kagenti-operator/internal/agentcard/writer_test.go +++ /dev/null @@ -1,140 +0,0 @@ -/* -Copyright 2025. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package agentcard - -import ( - "context" - "testing" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" -) - -func testScheme() *runtime.Scheme { - s := runtime.NewScheme() - utilruntime.Must(corev1.AddToScheme(s)) - utilruntime.Must(agentv1alpha1.AddToScheme(s)) - return s -} - -func testAgentCard() *agentv1alpha1.AgentCard { - return &agentv1alpha1.AgentCard{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-card", - Namespace: "test-ns", - UID: "test-uid-123", - }, - } -} - -func TestWriter_CreateConfigMap(t *testing.T) { - scheme := testScheme() - client := fake.NewClientBuilder().WithScheme(scheme).Build() - writer := NewWriter(client, client, scheme) - - owner := testAgentCard() - signedCard := []byte(`{"name":"test-agent","signatures":[{"protected":"xxx","signature":"yyy"}]}`) - - err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", signedCard) - if err != nil { - t.Fatalf("WriteSignedCard failed: %v", err) - } - - cm := &corev1.ConfigMap{} - err = client.Get(context.Background(), types.NamespacedName{ - Name: "my-agent-card-signed", - Namespace: "test-ns", - }, cm) - if err != nil { - t.Fatalf("ConfigMap not created: %v", err) - } - - if cm.Data[SignedCardConfigMapKey] != string(signedCard) { - t.Errorf("ConfigMap data mismatch: got %q", cm.Data[SignedCardConfigMapKey]) - } -} - -func TestWriter_UpdateConfigMap(t *testing.T) { - scheme := testScheme() - existing := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "my-agent-card-signed", - Namespace: "test-ns", - }, - Data: map[string]string{SignedCardConfigMapKey: `{"name":"old"}`}, - } - client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existing).Build() - writer := NewWriter(client, client, scheme) - - owner := testAgentCard() - newCard := []byte(`{"name":"updated"}`) - - err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", newCard) - if err != nil { - t.Fatalf("WriteSignedCard update failed: %v", err) - } - - cm := &corev1.ConfigMap{} - if err = client.Get(context.Background(), types.NamespacedName{ - Name: "my-agent-card-signed", - Namespace: "test-ns", - }, cm); err != nil { - t.Fatalf("ConfigMap not found after update: %v", err) - } - - if cm.Data[SignedCardConfigMapKey] != string(newCard) { - t.Errorf("ConfigMap not updated: got %q", cm.Data[SignedCardConfigMapKey]) - } -} - -func TestWriter_OwnerReference(t *testing.T) { - scheme := testScheme() - client := fake.NewClientBuilder().WithScheme(scheme).Build() - writer := NewWriter(client, client, scheme) - - owner := testAgentCard() - err := writer.WriteSignedCard(context.Background(), owner, "my-agent", "test-ns", []byte(`{}`)) - if err != nil { - t.Fatalf("WriteSignedCard failed: %v", err) - } - - cm := &corev1.ConfigMap{} - if err = client.Get(context.Background(), types.NamespacedName{ - Name: "my-agent-card-signed", - Namespace: "test-ns", - }, cm); err != nil { - t.Fatalf("ConfigMap not found: %v", err) - } - - if len(cm.OwnerReferences) == 0 { - t.Error("expected owner reference on ConfigMap") - } else { - ownerRef := cm.OwnerReferences[0] - if ownerRef.Name != "test-card" { - t.Errorf("expected owner name 'test-card', got %q", ownerRef.Name) - } - if ownerRef.UID != "test-uid-123" { - t.Errorf("expected owner UID 'test-uid-123', got %q", ownerRef.UID) - } - } -} diff --git a/kagenti-operator/internal/signature/signer.go b/kagenti-operator/internal/signature/signer.go index 02c240cb..a70752b6 100644 --- a/kagenti-operator/internal/signature/signer.go +++ b/kagenti-operator/internal/signature/signer.go @@ -17,7 +17,6 @@ limitations under the License. package signature import ( - "context" "crypto" "crypto/ecdsa" "crypto/elliptic" @@ -32,17 +31,8 @@ import ( agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" ) -// Signer produces JWS x5c signatures on AgentCard data using a SPIFFE identity. -type Signer interface { - SignCard(ctx context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) - SpiffeID() string - Ready() bool - Close() error -} - // SignCard signs AgentCard data with the given private key and certificate chain, -// producing a JWS x5c signed JSON output. This is the shared signing logic used -// by both the operator and the init-container. +// producing a JWS x5c signed JSON output. Used by the agentcard-signer init-container. func SignCard(cardData *agentv1alpha1.AgentCardData, privateKey crypto.Signer, certs []*x509.Certificate) ([]byte, error) { if cardData == nil { return nil, fmt.Errorf("card data is nil") diff --git a/kagenti-operator/internal/signature/spiffe_signer.go b/kagenti-operator/internal/signature/spiffe_signer.go deleted file mode 100644 index cdf6c29c..00000000 --- a/kagenti-operator/internal/signature/spiffe_signer.go +++ /dev/null @@ -1,129 +0,0 @@ -/* -Copyright 2025. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package signature - -import ( - "context" - "fmt" - "sync" - - "github.com/spiffe/go-spiffe/v2/svid/x509svid" - "github.com/spiffe/go-spiffe/v2/workloadapi" - - agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" -) - -// X509SVIDSource abstracts the SPIFFE X509Source for testability. -type X509SVIDSource interface { - GetX509SVID() (*x509svid.SVID, error) - Close() error -} - -// SpiffeSigner implements the Signer interface using a SPIFFE X509Source -// that auto-rotates SVIDs as they approach expiry. -type SpiffeSigner struct { - source X509SVIDSource - mu sync.RWMutex - closed bool -} - -// NewSpiffeSigner creates a SpiffeSigner backed by the SPIFFE Workload API. -// The X509Source automatically rotates SVIDs before expiry. -func NewSpiffeSigner(ctx context.Context, socketPath string) (*SpiffeSigner, error) { - source, err := workloadapi.NewX509Source(ctx, workloadapi.WithClientOptions(workloadapi.WithAddr(socketPath))) - if err != nil { - return nil, fmt.Errorf("failed to create X509Source: %w", err) - } - - return &SpiffeSigner{ - source: source, - }, nil -} - -// NewSpiffeSignerWithSource creates a SpiffeSigner with a custom X509SVIDSource (for testing). -func NewSpiffeSignerWithSource(source X509SVIDSource) *SpiffeSigner { - return &SpiffeSigner{ - source: source, - } -} - -// SignCard signs the AgentCard data using the current SVID from the X509Source. -func (s *SpiffeSigner) SignCard(ctx context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - if s.closed { - return nil, fmt.Errorf("signer is closed") - } - - svid, err := s.source.GetX509SVID() - if err != nil { - return nil, fmt.Errorf("failed to get X509-SVID: %w", err) - } - - signed, err := SignCard(cardData, svid.PrivateKey, svid.Certificates) - if err != nil { - return nil, fmt.Errorf("signing failed: %w", err) - } - - // Do NOT zero svid.PrivateKey here: the X509Source owns the SVID and - // returns a shared reference. Zeroing it would corrupt future calls. - return signed, nil -} - -// SpiffeID returns the SPIFFE ID from the current SVID, or empty string if unavailable. -func (s *SpiffeSigner) SpiffeID() string { - s.mu.RLock() - defer s.mu.RUnlock() - - if s.closed { - return "" - } - - svid, err := s.source.GetX509SVID() - if err != nil { - return "" - } - - return svid.ID.String() -} - -// Ready returns true when the signer has obtained at least one SVID. -func (s *SpiffeSigner) Ready() bool { - s.mu.RLock() - defer s.mu.RUnlock() - - if s.closed { - return false - } - - _, err := s.source.GetX509SVID() - return err == nil -} - -// Close shuts down the X509Source and releases resources. -func (s *SpiffeSigner) Close() error { - s.mu.Lock() - defer s.mu.Unlock() - - if s.closed { - return nil - } - - s.closed = true - return s.source.Close() -} diff --git a/kagenti-operator/internal/signature/spiffe_signer_test.go b/kagenti-operator/internal/signature/spiffe_signer_test.go deleted file mode 100644 index 31c8aba4..00000000 --- a/kagenti-operator/internal/signature/spiffe_signer_test.go +++ /dev/null @@ -1,161 +0,0 @@ -/* -Copyright 2025. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package signature - -import ( - "context" - "crypto/ecdsa" - "crypto/elliptic" - "crypto/rand" - "crypto/x509" - "fmt" - "testing" - - "github.com/spiffe/go-spiffe/v2/spiffeid" - "github.com/spiffe/go-spiffe/v2/svid/x509svid" -) - -type mockX509Source struct { - svid *x509svid.SVID - err error - closed bool - callCnt int -} - -func (m *mockX509Source) GetX509SVID() (*x509svid.SVID, error) { - m.callCnt++ - if m.err != nil { - return nil, m.err - } - return m.svid, nil -} - -func (m *mockX509Source) Close() error { - m.closed = true - return nil -} - -func newMockSVID(t *testing.T, ca *testCA, spiffeIDStr string) *x509svid.SVID { - t.Helper() - key, _ := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) - leaf := ca.issueLeaf(t, key, leafOpts{ - spiffeIDs: []string{spiffeIDStr}, - }) - td, _ := spiffeid.TrustDomainFromString("example.org") - id, _ := spiffeid.FromPath(td, "/ns/default/sa/test") - return &x509svid.SVID{ - ID: id, - Certificates: []*x509.Certificate{leaf, ca.Cert}, - PrivateKey: key, - } -} - -func TestSpiffeSigner_SignCard_Success(t *testing.T) { - ca := newTestCA(t) - svid := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") - source := &mockX509Source{svid: svid} - - signer := NewSpiffeSignerWithSource(source) - defer signer.Close() //nolint:errcheck // test cleanup - - card := testCard() - output, err := signer.SignCard(context.Background(), card) - if err != nil { - t.Fatalf("SignCard failed: %v", err) - } - if len(output) == 0 { - t.Error("expected non-empty signed output") - } -} - -func TestSpiffeSigner_NotReady(t *testing.T) { - source := &mockX509Source{err: fmt.Errorf("no SVID available")} - signer := NewSpiffeSignerWithSource(source) - defer signer.Close() //nolint:errcheck // test cleanup - - if signer.Ready() { - t.Error("expected Ready()=false when source returns error") - } - - _, err := signer.SignCard(context.Background(), testCard()) - if err == nil { - t.Error("expected error when signer not ready") - } -} - -func TestSpiffeSigner_SpiffeID(t *testing.T) { - ca := newTestCA(t) - svid := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") - source := &mockX509Source{svid: svid} - - signer := NewSpiffeSignerWithSource(source) - defer signer.Close() //nolint:errcheck // test cleanup - - id := signer.SpiffeID() - if id != "spiffe://example.org/ns/default/sa/test" { - t.Errorf("expected spiffe://example.org/ns/default/sa/test, got %q", id) - } -} - -func TestSpiffeSigner_SVIDRotation(t *testing.T) { - ca := newTestCA(t) - svid1 := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") - source := &mockX509Source{svid: svid1} - - signer := NewSpiffeSignerWithSource(source) - defer signer.Close() //nolint:errcheck // test cleanup - - card := testCard() - output1, err := signer.SignCard(context.Background(), card) - if err != nil { - t.Fatalf("first SignCard failed: %v", err) - } - - svid2 := newMockSVID(t, ca, "spiffe://example.org/ns/default/sa/test") - source.svid = svid2 - - card2 := testCard() - output2, err := signer.SignCard(context.Background(), card2) - if err != nil { - t.Fatalf("second SignCard failed: %v", err) - } - - if string(output1) == string(output2) { - t.Error("expected different signatures after SVID rotation (different key)") - } -} - -func TestSpiffeSigner_Close(t *testing.T) { - source := &mockX509Source{err: fmt.Errorf("not ready")} - signer := NewSpiffeSignerWithSource(source) - - if err := signer.Close(); err != nil { - t.Fatalf("Close failed: %v", err) - } - if !source.closed { - t.Error("expected source to be closed") - } - - if signer.Ready() { - t.Error("expected Ready()=false after close") - } - - _, err := signer.SignCard(context.Background(), testCard()) - if err == nil { - t.Error("expected error after close") - } -} From e83627ac7bcd1d3e56aa5aafdbdb0aa49686014b Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 22:38:10 +0100 Subject: [PATCH 10/16] feat: implement mTLS verified fetch for AgentCard trust Adds Phase 1 identity verification: the operator authenticates agents via mutual TLS using SPIFFE workload identity before trusting their AgentCard data. - SpiffeFetcher performs mTLS-authenticated fetch with cached HTTP client - evaluateTrust unifies mTLS and JWS signature verification paths - Verified condition is the single trust signal for NetworkPolicy gating - Identity binding evaluates attested SPIFFE IDs against trust domain - AttestedAgent printer column shows verified SPIFFE ID (wide output) - Proper gRPC connection lifecycle for X509Source Signed-off-by: Kevin Cogan --- .../api/v1alpha1/agentcard_types.go | 8 +- kagenti-operator/cmd/main.go | 72 ++- .../bases/agent.kagenti.dev_agentcards.yaml | 16 +- .../internal/agentcard/fetcher.go | 179 +++++- .../agentcard/fetcher_verified_test.go | 80 +++ .../controller/agentcard_controller.go | 559 ++++++++++++------ .../controller/agentcard_controller_test.go | 225 ------- .../controller/identity_binding_test.go | 18 +- 8 files changed, 692 insertions(+), 465 deletions(-) create mode 100644 kagenti-operator/internal/agentcard/fetcher_verified_test.go diff --git a/kagenti-operator/api/v1alpha1/agentcard_types.go b/kagenti-operator/api/v1alpha1/agentcard_types.go index fb7cfc34..39c63973 100644 --- a/kagenti-operator/api/v1alpha1/agentcard_types.go +++ b/kagenti-operator/api/v1alpha1/agentcard_types.go @@ -119,6 +119,11 @@ type AgentCardStatus struct { // +optional CardId string `json:"cardId,omitempty"` + // AttestedAgentSpiffeID is the SPIFFE ID extracted from the agent's TLS peer certificate + // during authenticated (mTLS) fetch. Set only when verifiedFetch is enabled and successful. + // +optional + AttestedAgentSpiffeID string `json:"attestedAgentSpiffeId,omitempty"` + // ExpectedSpiffeID is the SPIFFE ID used for binding evaluation. // +optional ExpectedSpiffeID string `json:"expectedSpiffeID,omitempty"` @@ -342,11 +347,12 @@ type SkillParameter struct { // +kubebuilder:printcolumn:name="Kind",type="string",JSONPath=".status.targetRef.kind",description="Workload Kind" // +kubebuilder:printcolumn:name="Target",type="string",JSONPath=".status.targetRef.name",description="Target Workload" // +kubebuilder:printcolumn:name="Agent",type="string",JSONPath=".status.card.name",description="Agent Name" -// +kubebuilder:printcolumn:name="Verified",type="boolean",JSONPath=".status.validSignature",description="Signature Verified" +// +kubebuilder:printcolumn:name="Verified",type="string",JSONPath=".status.conditions[?(@.type=='Verified')].status",description="Identity Verified" // +kubebuilder:printcolumn:name="Bound",type="boolean",JSONPath=".status.bindingStatus.bound",description="Identity Bound" // +kubebuilder:printcolumn:name="Synced",type="string",JSONPath=".status.conditions[?(@.type=='Synced')].status",description="Sync Status" // +kubebuilder:printcolumn:name="LastSync",type="date",JSONPath=".status.lastSyncTime",description="Last Sync Time" // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:printcolumn:name="AttestedAgent",type="string",JSONPath=".status.attestedAgentSpiffeId",description="Attested Agent SPIFFE ID",priority=1 // AgentCard is the Schema for the agentcards API. type AgentCard struct { diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index c2c5ba54..c152f15d 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -42,6 +42,8 @@ import ( metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" + "github.com/spiffe/go-spiffe/v2/workloadapi" + agentv1alpha1 "github.com/kagenti/operator/api/v1alpha1" "github.com/kagenti/operator/internal/agentcard" "github.com/kagenti/operator/internal/controller" @@ -89,8 +91,8 @@ func main() { var enableOperatorClientRegistration bool var enableMLflow bool - var enableOperatorSigning bool - var spiffeEndpointSocket string + var enableVerifiedFetch bool + var verifiedFetchSpiffeSocket string var spireTrustDomain string var spireTrustBundleConfigMapName string @@ -133,10 +135,11 @@ func main() { flag.BoolVar(&enableMLflow, "enable-mlflow", false, "Enable MLflow experiment tracking integration") - flag.BoolVar(&enableOperatorSigning, "enable-operator-signing", false, - "Operator signs agent cards with its own SPIFFE identity instead of the init-container") - flag.StringVar(&spiffeEndpointSocket, "spiffe-endpoint-socket", "unix:///spiffe-workload-api/spire-agent.sock", - "SPIFFE Workload API socket path for operator signing") + flag.BoolVar(&enableVerifiedFetch, "enable-verified-fetch", false, + "Enable mTLS-authenticated fetch of agent cards via SPIFFE identity") + flag.StringVar(&verifiedFetchSpiffeSocket, "verified-fetch-spiffe-socket", + "unix:///spiffe-workload-api/spire-agent.sock", + "SPIFFE Workload API socket path for verified fetch") flag.StringVar(&spireTrustDomain, "spire-trust-domain", "", "SPIRE trust domain for identity binding (e.g. 'example.org')") @@ -287,10 +290,10 @@ func main() { os.Exit(1) } - if !requireA2ASignature { - setupLog.Info("WARNING: --require-a2a-signature is false. Identity binding requires " + - "--require-a2a-signature=true to function. AgentCards with spec.identityBinding " + - "will always report NotBound.") + if !requireA2ASignature && !enableVerifiedFetch { + setupLog.Info("WARNING: Neither --require-a2a-signature nor --enable-verified-fetch is set. " + + "Identity binding requires at least one trust mechanism to function. " + + "AgentCards with spec.identityBinding will report NotBound.") } var sigProvider signature.Provider @@ -329,8 +332,34 @@ func main() { } agentFetcher := agentcard.NewConfigMapFetcher(mgr.GetAPIReader()) - if enableOperatorSigning { - agentFetcher = agentcard.NewFetcher() + + var authenticatedFetcher agentcard.AuthenticatedFetcher + if enableVerifiedFetch { + fetchCtx, fetchCancel := context.WithTimeout(ctx, 30*time.Second) + fetchX509Source, fetchSourceErr := workloadapi.NewX509Source( + fetchCtx, + workloadapi.WithClientOptions(workloadapi.WithAddr(verifiedFetchSpiffeSocket)), + ) + fetchCancel() + + if fetchSourceErr != nil { + setupLog.Info("WARNING: SPIRE unavailable for verified fetch, falling back to default fetcher", + "error", fetchSourceErr.Error(), + "socket", verifiedFetchSpiffeSocket) + enableVerifiedFetch = false + } else { + td := spireTrustDomain + if td == "" { + setupLog.Error(errors.New("missing required flag"), + "--spire-trust-domain is required when --enable-verified-fetch=true") + os.Exit(1) + } + authenticatedFetcher = agentcard.NewSpiffeFetcher(fetchX509Source, td) + defer fetchX509Source.Close() //nolint:errcheck + setupLog.Info("Verified fetch enabled (mTLS via SPIFFE)", + "socket", verifiedFetchSpiffeSocket, + "trustDomain", td) + } } agentCardReconciler := &controller.AgentCardReconciler{ @@ -338,25 +367,13 @@ func main() { Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("agentcard-controller"), AgentFetcher: agentFetcher, + AuthenticatedFetcher: authenticatedFetcher, + EnableVerifiedFetch: enableVerifiedFetch, SignatureProvider: sigProvider, RequireSignature: requireA2ASignature, SignatureAuditMode: signatureAuditMode, SpireTrustDomain: spireTrustDomain, SVIDExpiryGracePeriod: svidExpiryGracePeriod, - EnableOperatorSign: enableOperatorSigning, - } - - if enableOperatorSigning { - cardSigner, signerErr := signature.NewSpiffeSigner(ctx, spiffeEndpointSocket) - if signerErr != nil { - setupLog.Error(signerErr, "unable to create SPIFFE signer for operator signing") - os.Exit(1) - } - defer cardSigner.Close() //nolint:errcheck // best-effort cleanup on shutdown - agentCardReconciler.CardSigner = cardSigner - agentCardReconciler.CardWriter = agentcard.NewWriter(mgr.GetClient(), mgr.GetAPIReader(), mgr.GetScheme()) - setupLog.Info("Operator AgentCard signing enabled", - "spiffeEndpointSocket", spiffeEndpointSocket) } if err = agentCardReconciler.SetupWithManager(mgr); err != nil { @@ -369,6 +386,7 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), EnforceNetworkPolicies: enforceNetworkPolicies, + OperatorNamespace: os.Getenv("POD_NAMESPACE"), } npReconciler.DiscoverKubeAPIServerCIDRs( context.Background(), mgr.GetAPIReader(), @@ -377,7 +395,7 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "AgentCardNetworkPolicy") os.Exit(1) } - setupLog.Info("Network policy enforcement enabled for signature verification") + setupLog.Info("Network policy enforcement enabled for identity verification") } if err = (&controller.AgentCardSyncReconciler{ diff --git a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml index 21d35447..e2315ef7 100644 --- a/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml +++ b/kagenti-operator/config/crd/bases/agent.kagenti.dev_agentcards.yaml @@ -34,10 +34,10 @@ spec: jsonPath: .status.card.name name: Agent type: string - - description: Signature Verified - jsonPath: .status.validSignature + - description: Identity Verified + jsonPath: .status.conditions[?(@.type=='Verified')].status name: Verified - type: boolean + type: string - description: Identity Bound jsonPath: .status.bindingStatus.bound name: Bound @@ -53,6 +53,11 @@ spec: - jsonPath: .metadata.creationTimestamp name: Age type: date + - description: Attested Agent SPIFFE ID + jsonPath: .status.attestedAgentSpiffeId + name: AttestedAgent + priority: 1 + type: string name: v1alpha1 schema: openAPIV3Schema: @@ -129,6 +134,11 @@ spec: status: description: AgentCardStatus defines the observed state of AgentCard. properties: + attestedAgentSpiffeId: + description: |- + AttestedAgentSpiffeID is the SPIFFE ID extracted from the agent's TLS peer certificate + during authenticated (mTLS) fetch. Set only when verifiedFetch is enabled and successful. + type: string bindingStatus: description: BindingStatus contains the result of identity binding evaluation diff --git a/kagenti-operator/internal/agentcard/fetcher.go b/kagenti-operator/internal/agentcard/fetcher.go index f9476159..68dd003a 100644 --- a/kagenti-operator/internal/agentcard/fetcher.go +++ b/kagenti-operator/internal/agentcard/fetcher.go @@ -18,13 +18,19 @@ package agentcard import ( "context" + "crypto/tls" + "crypto/x509" "encoding/json" "errors" "fmt" "io" "net/http" + "net/url" "time" + "github.com/spiffe/go-spiffe/v2/spiffeid" + "github.com/spiffe/go-spiffe/v2/spiffetls/tlsconfig" + "github.com/spiffe/go-spiffe/v2/workloadapi" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" @@ -48,6 +54,11 @@ const ( SignedCardConfigMapKey = "agent-card.json" ) +// ConfigMapName returns the expected ConfigMap name for a signed agent card. +func ConfigMapName(agentName string) string { + return agentName + SignedCardConfigMapSuffix +} + type Fetcher interface { Fetch(ctx context.Context, protocol, serviceURL, agentName, namespace string, ) (*agentv1alpha1.AgentCardData, error) @@ -108,36 +119,51 @@ func (f *DefaultFetcher) fetchA2ACard(ctx context.Context, serviceURL string) (* // errNotFound is returned when the agent card endpoint returns HTTP 404. var errNotFound = errors.New("agent card not found") -func (f *DefaultFetcher) fetchAgentCardFromPath(ctx context.Context, serviceURL, path string) (*agentv1alpha1.AgentCardData, error) { - agentCardURL := serviceURL + path - fetcherLogger.Info("Fetching A2A agent card", "url", agentCardURL) +// maxCardSize caps the response body read to prevent memory exhaustion. +const maxCardSize = 1 << 20 // 1 MiB - req, err := http.NewRequestWithContext(ctx, http.MethodGet, agentCardURL, nil) +// doHTTPFetch performs a GET request and returns the raw response body and TLS +// connection state. It handles 404 (errNotFound), non-200 errors, and limits +// the body read to maxCardSize. +func doHTTPFetch(ctx context.Context, httpClient *http.Client, fetchURL string) ([]byte, *tls.ConnectionState, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL, nil) if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) + return nil, nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Accept", "application/json") - resp, err := f.httpClient.Do(req) + resp, err := httpClient.Do(req) if err != nil { - return nil, fmt.Errorf("failed to fetch agent card: %w", err) + return nil, nil, fmt.Errorf("failed to fetch agent card: %w", err) } defer func() { _ = resp.Body.Close() }() - const maxCardSize = 1 << 20 // 1 MiB - if resp.StatusCode == http.StatusNotFound { - return nil, errNotFound + return nil, nil, errNotFound } if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(io.LimitReader(resp.Body, maxCardSize)) - return nil, fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(body)) + return nil, nil, fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(body)) } body, err := io.ReadAll(io.LimitReader(resp.Body, maxCardSize)) if err != nil { - return nil, fmt.Errorf("failed to read response body: %w", err) + return nil, nil, fmt.Errorf("failed to read response body: %w", err) + } + + return body, resp.TLS, nil +} + +func (f *DefaultFetcher) fetchAgentCardFromPath( + ctx context.Context, serviceURL, path string, +) (*agentv1alpha1.AgentCardData, error) { + agentCardURL := serviceURL + path + fetcherLogger.Info("Fetching A2A agent card", "url", agentCardURL) + + body, _, err := doHTTPFetch(ctx, f.httpClient, agentCardURL) + if err != nil { + return nil, err } var agentCardData agentv1alpha1.AgentCardData @@ -198,3 +224,132 @@ func (f *ConfigMapFetcher) Fetch( func GetServiceURL(agentName, namespace string, port int32) string { return fmt.Sprintf("http://%s.%s.svc.cluster.local:%d", agentName, namespace, port) } + +// GetSecureServiceURL returns an HTTPS URL for the agent's TLS port. +func GetSecureServiceURL(agentName, namespace string, port int32) string { + return fmt.Sprintf("https://%s.%s.svc.cluster.local:%d", agentName, namespace, port) +} + +// FetchResult contains the result of an authenticated fetch including +// the agent's verified SPIFFE ID extracted from the TLS peer certificate. +type FetchResult struct { + CardData *agentv1alpha1.AgentCardData + AgentSpiffeID string +} + +// AuthenticatedFetcher performs mTLS-authenticated fetches and returns +// identity information from the TLS handshake alongside the card data. +type AuthenticatedFetcher interface { + FetchAuthenticated(ctx context.Context, protocol, serviceURL string) (*FetchResult, error) +} + +// SpiffeFetcher implements AuthenticatedFetcher using go-spiffe mTLS. +// It verifies the agent belongs to the configured trust domain and extracts +// the SPIFFE ID from the peer certificate. The HTTP client is reused across +// fetches for connection pooling; the TLS config dynamically reads the latest +// SVID from the X509Source on each handshake. +type SpiffeFetcher struct { + x509Source *workloadapi.X509Source + trustDomain string + httpClient *http.Client +} + +// NewSpiffeFetcher creates a SpiffeFetcher that uses the provided X509Source +// for mTLS and validates peers against the given trust domain. +func NewSpiffeFetcher(source *workloadapi.X509Source, trustDomain string) *SpiffeFetcher { + td, _ := spiffeid.TrustDomainFromString(trustDomain) + tlsCfg := tlsconfig.MTLSClientConfig(source, source, tlsconfig.AuthorizeMemberOf(td)) + return &SpiffeFetcher{ + x509Source: source, + trustDomain: trustDomain, + httpClient: &http.Client{ + Timeout: DefaultFetchTimeout, + Transport: &http.Transport{TLSClientConfig: tlsCfg}, + }, + } +} + +func (f *SpiffeFetcher) FetchAuthenticated(ctx context.Context, protocol, serviceURL string) (*FetchResult, error) { + switch protocol { + case A2AProtocol: + return f.fetchA2ACardAuthenticated(ctx, serviceURL) + default: + return nil, fmt.Errorf("unsupported protocol: %s", protocol) + } +} + +func (f *SpiffeFetcher) fetchA2ACardAuthenticated(ctx context.Context, serviceURL string) (*FetchResult, error) { + result, err := f.fetchAuthenticatedFromPath(ctx, serviceURL, A2AAgentCardPath) + if err == nil { + return result, nil + } + + if !errors.Is(err, errNotFound) { + return nil, err + } + + fetcherLogger.Info("Agent card not found at current endpoint, trying legacy endpoint (authenticated)", + "currentPath", A2AAgentCardPath, + "legacyPath", A2ALegacyAgentCardPath) + + result, legacyErr := f.fetchAuthenticatedFromPath(ctx, serviceURL, A2ALegacyAgentCardPath) + if legacyErr != nil { + return nil, err + } + + fetcherLogger.Info("Agent card served from deprecated endpoint (authenticated)", + "deprecated", true, + "migrateTo", A2AAgentCardPath, + "legacyPath", A2ALegacyAgentCardPath, + "agentName", result.CardData.Name) + + return result, nil +} + +func (f *SpiffeFetcher) fetchAuthenticatedFromPath(ctx context.Context, serviceURL, path string) (*FetchResult, error) { + agentCardURL := serviceURL + path + fetcherLogger.Info("Fetching A2A agent card (mTLS)", "url", agentCardURL) + + body, tlsState, err := doHTTPFetch(ctx, f.httpClient, agentCardURL) + if err != nil { + return nil, err + } + + var agentCardData agentv1alpha1.AgentCardData + if err := json.Unmarshal(body, &agentCardData); err != nil { + return nil, fmt.Errorf("failed to parse agent card JSON: %w", err) + } + + agentSpiffeID := extractSpiffeIDFromTLS(tlsState) + + fetcherLogger.Info("Successfully fetched agent card (mTLS)", + "name", agentCardData.Name, + "version", agentCardData.Version, + "agentSpiffeID", agentSpiffeID) + + return &FetchResult{ + CardData: &agentCardData, + AgentSpiffeID: agentSpiffeID, + }, nil +} + +// extractSpiffeIDFromTLS returns the SPIFFE ID from the peer certificate's URI SANs. +func extractSpiffeIDFromTLS(state *tls.ConnectionState) string { + if state == nil || len(state.PeerCertificates) == 0 { + return "" + } + return spiffeIDFromCert(state.PeerCertificates[0]) +} + +func spiffeIDFromCert(cert *x509.Certificate) string { + for _, uri := range cert.URIs { + parsed, err := url.Parse(uri.String()) + if err != nil { + continue + } + if parsed.Scheme == "spiffe" { + return uri.String() + } + } + return "" +} diff --git a/kagenti-operator/internal/agentcard/fetcher_verified_test.go b/kagenti-operator/internal/agentcard/fetcher_verified_test.go new file mode 100644 index 00000000..7e1cb7a3 --- /dev/null +++ b/kagenti-operator/internal/agentcard/fetcher_verified_test.go @@ -0,0 +1,80 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agentcard + +import ( + "crypto/x509" + "net/url" + "testing" + + "github.com/onsi/gomega" +) + +func TestGetSecureServiceURL(t *testing.T) { + g := gomega.NewGomegaWithT(t) + g.Expect(GetSecureServiceURL("my-agent", "default", 8443)). + To(gomega.Equal("https://my-agent.default.svc.cluster.local:8443")) +} + +func TestSpiffeIDFromCert_ValidSPIFFE(t *testing.T) { + g := gomega.NewGomegaWithT(t) + spiffeURI, _ := url.Parse("spiffe://example.org/ns/team1/sa/weather-agent") + cert := &x509.Certificate{ + URIs: []*url.URL{spiffeURI}, + } + g.Expect(spiffeIDFromCert(cert)).To(gomega.Equal("spiffe://example.org/ns/team1/sa/weather-agent")) +} + +func TestSpiffeIDFromCert_NoURISANs(t *testing.T) { + g := gomega.NewGomegaWithT(t) + cert := &x509.Certificate{} + g.Expect(spiffeIDFromCert(cert)).To(gomega.BeEmpty()) +} + +func TestSpiffeIDFromCert_NonSpiffeURI(t *testing.T) { + g := gomega.NewGomegaWithT(t) + httpURI, _ := url.Parse("https://example.com/agent") + cert := &x509.Certificate{ + URIs: []*url.URL{httpURI}, + } + g.Expect(spiffeIDFromCert(cert)).To(gomega.BeEmpty()) +} + +func TestSpiffeIDFromCert_MultipleURIs(t *testing.T) { + g := gomega.NewGomegaWithT(t) + httpURI, _ := url.Parse("https://example.com/agent") + spiffeURI, _ := url.Parse("spiffe://trust.domain/workload") + cert := &x509.Certificate{ + URIs: []*url.URL{httpURI, spiffeURI}, + } + g.Expect(spiffeIDFromCert(cert)).To(gomega.Equal("spiffe://trust.domain/workload")) +} + +func TestNewSpiffeFetcher(t *testing.T) { + g := gomega.NewGomegaWithT(t) + fetcher := NewSpiffeFetcher(nil, "example.org") + g.Expect(fetcher).NotTo(gomega.BeNil()) + g.Expect(fetcher.trustDomain).To(gomega.Equal("example.org")) +} + +func TestSpiffeFetcher_UnsupportedProtocol(t *testing.T) { + g := gomega.NewGomegaWithT(t) + fetcher := NewSpiffeFetcher(nil, "example.org") + _, err := fetcher.FetchAuthenticated(t.Context(), "unsupported", "https://example.com") + g.Expect(err).To(gomega.HaveOccurred()) + g.Expect(err.Error()).To(gomega.ContainSubstring("unsupported protocol")) +} diff --git a/kagenti-operator/internal/controller/agentcard_controller.go b/kagenti-operator/internal/controller/agentcard_controller.go index f766086a..2c4d3161 100644 --- a/kagenti-operator/internal/controller/agentcard_controller.go +++ b/kagenti-operator/internal/controller/agentcard_controller.go @@ -121,6 +121,13 @@ type AgentCardReconciler struct { AgentFetcher agentcard.Fetcher + // AuthenticatedFetcher performs mTLS-authenticated fetches. + // Nil when verifiedFetch is disabled. + AuthenticatedFetcher agentcard.AuthenticatedFetcher + + // EnableVerifiedFetch gates the mTLS authenticated fetch code path. + EnableVerifiedFetch bool + SignatureProvider signature.Provider RequireSignature bool SignatureAuditMode bool @@ -131,16 +138,6 @@ type AgentCardReconciler struct { // SVIDExpiryGracePeriod controls how far before the leaf cert expires the operator // triggers a proactive workload restart. Defaults to DefaultSVIDExpiryGracePeriod. SVIDExpiryGracePeriod time.Duration - - // CardSigner signs agent cards with the operator's SPIFFE identity. - // Nil when operator signing is disabled. - CardSigner signature.Signer - - // EnableOperatorSign gates the operator signing code path. - EnableOperatorSign bool - - // CardWriter writes signed cards to ConfigMaps. Nil when operator signing is disabled. - CardWriter *agentcard.Writer } // +kubebuilder:rbac:groups=agent.kagenti.dev,resources=agentcards,verbs=get;list;watch;create;update;patch;delete @@ -235,62 +232,171 @@ func (r *AgentCardReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( servicePort := r.getServicePort(service) serviceURL := agentcard.GetServiceURL(workload.ServiceName, agentCard.Namespace, servicePort) - cardData, err := r.AgentFetcher.Fetch(ctx, protocol, serviceURL, workload.ServiceName, agentCard.Namespace) + cardData, fetchResult, err := r.fetchCardData(ctx, agentCard, protocol, serviceURL, workload, service) if err != nil { - agentCardLogger.Error(err, "Failed to fetch agent card", "workload", workload.Name, "url", serviceURL) - if condErr := r.updateCondition(ctx, agentCard, "Synced", metav1.ConditionFalse, "FetchFailed", err.Error()); condErr != nil { + if condErr := r.updateCondition(ctx, agentCard, + "Ready", metav1.ConditionFalse, "SyncFailed", "Agent card fetch failed"); condErr != nil { return ctrl.Result{}, condErr } return ctrl.Result{RequeueAfter: 1 * time.Minute}, nil } - if r.EnableOperatorSign && r.CardSigner != nil { - if !r.CardSigner.Ready() { - agentCardLogger.Info("Operator signer not ready, requeueing", "workload", workload.Name) - if condErr := r.updateCondition(ctx, agentCard, "Synced", metav1.ConditionFalse, "SignerNotReady", - "Operator SPIFFE signer has not yet obtained an SVID"); condErr != nil { - return ctrl.Result{}, condErr - } - return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + trust := r.evaluateTrust(ctx, agentCard, cardData, fetchResult, workload) + + cardData.URL = serviceURL + cardID := r.computeCardID(cardData) + if cardID != "" && agentCard.Status.CardId != "" && agentCard.Status.CardId != cardID { + if r.Recorder != nil { + r.Recorder.Event(agentCard, corev1.EventTypeWarning, "CardContentChanged", + fmt.Sprintf("Agent card content changed: previous=%s, current=%s", + agentCard.Status.CardId, cardID)) } + agentCardLogger.Info("Card content changed", + "agentCard", agentCard.Name, + "previousCardId", agentCard.Status.CardId, + "newCardId", cardID) + } - signedBytes, signErr := r.CardSigner.SignCard(ctx, cardData) - if signErr != nil { - agentCardLogger.Error(signErr, "Operator signing failed", "workload", workload.Name) - if condErr := r.updateCondition(ctx, agentCard, "Synced", metav1.ConditionFalse, "SigningFailed", signErr.Error()); condErr != nil { - return ctrl.Result{}, condErr - } - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + resolvedTargetRef := &agentv1alpha1.TargetRef{ + APIVersion: workload.APIVersion, + Kind: workload.Kind, + Name: workload.Name, + } + + if err := r.updateAgentCardStatus(ctx, agentCard, cardData, protocol, cardID, + resolvedTargetRef, trust.verificationResult, trust.binding, trust.identityMatch, + trust.isMTLSVerified, trust.verifiedReason, trust.verifiedStatus, + trust.verifiedMessage, trust.attestedSpiffeID); err != nil { + agentCardLogger.Error(err, "Failed to update AgentCard status") + return ctrl.Result{}, err + } + + // Label propagation: only touches workload labels, not AgentCard status. + if trust.isMTLSVerified { + if err := r.propagateVerifiedLabel(ctx, agentCard, workload, true); err != nil { + agentCardLogger.Error(err, "Failed to propagate verified label to workload", + "workload", workload.Name, "verified", true) + return ctrl.Result{}, err + } + } else if r.RequireSignature { + if err := r.propagateSignatureLabel(ctx, agentCard.Name, workload, trust.isVerified); err != nil { + agentCardLogger.Error(err, "Failed to propagate signature label to workload", + "workload", workload.Name, "verified", trust.isVerified) + return ctrl.Result{}, err + } + if trust.verificationResult != nil && !trust.verificationResult.Verified && !r.SignatureAuditMode { + agentCardLogger.Info("Signature verification failed, rejecting agent card", + "workload", workload.Name, + "details", trust.verificationResult.Details) + return ctrl.Result{RequeueAfter: 1 * time.Minute}, nil } + } else if r.EnableVerifiedFetch && r.AuthenticatedFetcher != nil { + if err := r.propagateVerifiedLabel(ctx, agentCard, workload, false); err != nil { + agentCardLogger.Error(err, "Failed to propagate verified label to workload", + "workload", workload.Name, "verified", false) + return ctrl.Result{}, err + } + } - if r.CardWriter != nil { - if writeErr := r.CardWriter.WriteSignedCard(ctx, agentCard, workload.ServiceName, agentCard.Namespace, signedBytes); writeErr != nil { - agentCardLogger.Error(writeErr, "Failed to write signed card ConfigMap (non-fatal)", "workload", workload.Name) - if r.Recorder != nil { - r.Recorder.Event(agentCard, corev1.EventTypeWarning, "ConfigMapWriteFailed", writeErr.Error()) + syncPeriod := r.getSyncPeriod(agentCard) + agentCardLogger.V(1).Info("Successfully synced agent card", + "workload", workload.Name, "kind", workload.Kind, "nextSync", syncPeriod) + + return ctrl.Result{RequeueAfter: syncPeriod}, nil +} + +// trustEvaluation holds the computed trust state for an AgentCard reconcile. +type trustEvaluation struct { + verificationResult *signature.VerificationResult + binding *bindingResult + identityMatch *bool + isMTLSVerified bool + isVerified bool + verifiedStatus metav1.ConditionStatus + verifiedReason string + verifiedMessage string + attestedSpiffeID string +} + +// fetchCardData retrieves the agent card, choosing mTLS or plain HTTP based on +// configuration and service port availability. Returns an error only if the +// fetch fails — error conditions are already written to the AgentCard status. +func (r *AgentCardReconciler) fetchCardData( + ctx context.Context, agentCard *agentv1alpha1.AgentCard, + protocol, serviceURL string, workload *WorkloadInfo, service *corev1.Service, +) (*agentv1alpha1.AgentCardData, *agentcard.FetchResult, error) { + if r.EnableVerifiedFetch && r.AuthenticatedFetcher != nil { + tlsPort := r.getAgentTLSPort(service) + if tlsPort > 0 { + secureURL := agentcard.GetSecureServiceURL( + workload.ServiceName, agentCard.Namespace, tlsPort) + fetchResult, err := r.AuthenticatedFetcher.FetchAuthenticated(ctx, protocol, secureURL) + if err != nil { + agentCardLogger.Error(err, "Authenticated fetch failed", + "workload", workload.Name, "url", secureURL) + if condErr := r.updateCondition(ctx, agentCard, + "Synced", metav1.ConditionFalse, "FetchFailed", err.Error()); condErr != nil { + return nil, nil, condErr + } + if condErr := r.setVerifiedCondition(ctx, agentCard, + metav1.ConditionFalse, "FetchFailed", err.Error()); condErr != nil { + return nil, nil, condErr } + return nil, nil, err + } + return fetchResult.CardData, fetchResult, nil + } + agentCardLogger.Info("TLS port not found on service, falling back to HTTP fetch", + "service", workload.ServiceName, "expectedPortName", AgentTLSPortName) + cardData, err := r.AgentFetcher.Fetch( + ctx, protocol, serviceURL, workload.ServiceName, agentCard.Namespace) + if err != nil { + agentCardLogger.Error(err, "Failed to fetch agent card", + "workload", workload.Name, "url", serviceURL) + if condErr := r.updateCondition(ctx, agentCard, + "Synced", metav1.ConditionFalse, "FetchFailed", err.Error()); condErr != nil { + return nil, nil, condErr } + return nil, nil, err } + return cardData, nil, nil + } - agentCardLogger.Info("Operator signed agent card", - "workload", workload.Name, - "spiffeID", r.CardSigner.SpiffeID()) + cardData, err := r.AgentFetcher.Fetch( + ctx, protocol, serviceURL, workload.ServiceName, agentCard.Namespace) + if err != nil { + agentCardLogger.Error(err, "Failed to fetch agent card", + "workload", workload.Name, "url", serviceURL) + if condErr := r.updateCondition(ctx, agentCard, + "Synced", metav1.ConditionFalse, "FetchFailed", err.Error()); condErr != nil { + return nil, nil, condErr + } + return nil, nil, err } + r.cleanupVerifiedFetchFields(ctx, agentCard) + return cardData, nil, nil +} + +// evaluateTrust performs signature verification, binding computation, and +// determines the final Verified status for an AgentCard. +func (r *AgentCardReconciler) evaluateTrust( //nolint:gocyclo + ctx context.Context, agentCard *agentv1alpha1.AgentCard, + cardData *agentv1alpha1.AgentCardData, fetchResult *agentcard.FetchResult, + workload *WorkloadInfo, +) *trustEvaluation { + eval := &trustEvaluation{} - var verificationResult *signature.VerificationResult if r.RequireSignature { var verifyErr error - verificationResult, verifyErr = r.verifySignature(ctx, cardData) - + eval.verificationResult, verifyErr = r.verifySignature(ctx, cardData) if verifyErr != nil { agentCardLogger.Error(verifyErr, "Signature verification error", "workload", workload.Name) } - - if verificationResult != nil { - if verificationResult.Verified { + if eval.verificationResult != nil { + if eval.verificationResult.Verified { if r.Recorder != nil { r.Recorder.Event(agentCard, corev1.EventTypeNormal, "SignatureEvaluated", - fmt.Sprintf("Signature verified successfully (keyID=%s)", verificationResult.KeyID)) + fmt.Sprintf("Signature verified successfully (keyID=%s)", eval.verificationResult.KeyID)) } } else { reason := ReasonSignatureInvalid @@ -300,84 +406,69 @@ func (r *AgentCardReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( agentCardLogger.Info("Signature verification failed", "workload", workload.Name, "reason", reason, - "details", verificationResult.Details) + "details", eval.verificationResult.Details) if r.Recorder != nil { - r.Recorder.Event(agentCard, corev1.EventTypeWarning, "SignatureFailed", verificationResult.Details) + r.Recorder.Event(agentCard, corev1.EventTypeWarning, "SignatureFailed", eval.verificationResult.Details) } } } } - if r.RequireSignature && verificationResult != nil && verificationResult.Verified { - r.maybeRestartForResign(ctx, agentCard, workload, verificationResult) - } - - cardData.URL = serviceURL - - cardId := r.computeCardId(cardData) - if cardId != "" && agentCard.Status.CardId != "" && agentCard.Status.CardId != cardId { - if r.Recorder != nil { - r.Recorder.Event(agentCard, corev1.EventTypeWarning, "CardContentChanged", - fmt.Sprintf("Agent card content changed: previous=%s, current=%s", agentCard.Status.CardId, cardId)) - } - agentCardLogger.Info("Card content changed", "agentCard", agentCard.Name, "previousCardId", agentCard.Status.CardId, "newCardId", cardId) + if r.RequireSignature && eval.verificationResult != nil && eval.verificationResult.Verified { + r.maybeRestartForResign(ctx, agentCard, workload, eval.verificationResult) } - resolvedTargetRef := &agentv1alpha1.TargetRef{ - APIVersion: workload.APIVersion, - Kind: workload.Kind, - Name: workload.Name, - } + sigVerified := eval.verificationResult != nil && eval.verificationResult.Verified + eval.isMTLSVerified = fetchResult != nil && fetchResult.AgentSpiffeID != "" + // Binding evaluation (independent of verified status computation) var bindingPassed bool - var binding *bindingResult - var identityMatch *bool - sigVerified := verificationResult != nil && verificationResult.Verified if agentCard.Spec.IdentityBinding != nil { var verifiedSpiffeID string - if verificationResult != nil && verificationResult.Verified && verificationResult.SpiffeID != "" { - verifiedSpiffeID = verificationResult.SpiffeID + if eval.isMTLSVerified { + verifiedSpiffeID = fetchResult.AgentSpiffeID + } else if eval.verificationResult != nil && + eval.verificationResult.Verified && eval.verificationResult.SpiffeID != "" { + verifiedSpiffeID = eval.verificationResult.SpiffeID } - binding = r.computeBinding(agentCard, verifiedSpiffeID) - bindingPassed = binding != nil && binding.Bound - match := sigVerified && bindingPassed - identityMatch = &match - } - - var vr *signature.VerificationResult - if r.RequireSignature { - vr = verificationResult - } - if err := r.updateAgentCardStatus(ctx, agentCard, cardData, protocol, cardId, resolvedTargetRef, vr, binding, identityMatch); err != nil { - agentCardLogger.Error(err, "Failed to update AgentCard status") - return ctrl.Result{}, err - } - - // Signature must pass for the label. Binding must also pass when strict: true; - // when strict: false (default), binding failures are recorded in status only. - if r.RequireSignature { - isVerified := sigVerified + eval.binding = r.computeBinding(agentCard, verifiedSpiffeID) + bindingPassed = eval.binding != nil && eval.binding.Bound + match := (sigVerified || eval.isMTLSVerified) && bindingPassed + eval.identityMatch = &match + } + + // Compute final Verified status + if eval.isMTLSVerified { + eval.verifiedStatus = metav1.ConditionTrue + eval.verifiedReason = "mTLSVerified" + eval.verifiedMessage = fmt.Sprintf("Agent SPIFFE ID: %s", fetchResult.AgentSpiffeID) + eval.attestedSpiffeID = fetchResult.AgentSpiffeID + eval.isVerified = true + } else if r.RequireSignature { + eval.isVerified = sigVerified if agentCard.Spec.IdentityBinding != nil { - isVerified = isVerified && (bindingPassed || !agentCard.Spec.IdentityBinding.Strict) - } - if err := r.propagateSignatureLabel(ctx, agentCard.Name, workload, isVerified); err != nil { - agentCardLogger.Error(err, "Failed to propagate signature label to workload", - "workload", workload.Name, "verified", isVerified) - return ctrl.Result{}, err + eval.isVerified = sigVerified && (bindingPassed || !agentCard.Spec.IdentityBinding.Strict) } - - if verificationResult != nil && !verificationResult.Verified && !r.SignatureAuditMode { - agentCardLogger.Info("Signature verification failed, rejecting agent card", - "workload", workload.Name, - "details", verificationResult.Details) - return ctrl.Result{RequeueAfter: 1 * time.Minute}, nil + if eval.isVerified { + eval.verifiedStatus = metav1.ConditionTrue + eval.verifiedReason = "SignatureVerified" + eval.verifiedMessage = fmt.Sprintf("JWS signature valid (keyID=%s)", eval.verificationResult.KeyID) + } else { + eval.verifiedStatus = metav1.ConditionFalse + eval.verifiedReason = "NotVerified" + eval.verifiedMessage = "No trust mechanism verified this agent" + if eval.verificationResult != nil && !eval.verificationResult.Verified { + eval.verifiedReason = ReasonSignatureInvalid + eval.verifiedMessage = eval.verificationResult.Details + } } + } else if r.EnableVerifiedFetch && r.AuthenticatedFetcher != nil { + eval.verifiedStatus = metav1.ConditionFalse + eval.verifiedReason = "NotVerified" + eval.verifiedMessage = "No trust mechanism verified this agent" } - syncPeriod := r.getSyncPeriod(agentCard) - agentCardLogger.V(1).Info("Successfully synced agent card", "workload", workload.Name, "kind", workload.Kind, "nextSync", syncPeriod) - - return ctrl.Result{RequeueAfter: syncPeriod}, nil + return eval } func (r *AgentCardReconciler) getWorkload(ctx context.Context, agentCard *agentv1alpha1.AgentCard) (*WorkloadInfo, error) { @@ -574,6 +665,20 @@ func (r *AgentCardReconciler) getServicePort(service *corev1.Service) int32 { return 8000 } +// AgentTLSPortName is the named port convention for the agent's TLS listener. +const AgentTLSPortName = "agent-tls" + +// getAgentTLSPort returns the port number for the agent-tls named port. +// Returns 0 if the named port is not found. +func (r *AgentCardReconciler) getAgentTLSPort(service *corev1.Service) int32 { + for _, port := range service.Spec.Ports { + if port.Name == AgentTLSPortName { + return port.Port + } + } + return 0 +} + func (r *AgentCardReconciler) getSyncPeriod(agentCard *agentv1alpha1.AgentCard) time.Duration { if agentCard.Spec.SyncPeriod == "" { return DefaultSyncPeriod @@ -590,7 +695,24 @@ func (r *AgentCardReconciler) getSyncPeriod(agentCard *agentv1alpha1.AgentCard) } // updateAgentCardStatus persists all status fields atomically with retry. -func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCard *agentv1alpha1.AgentCard, cardData *agentv1alpha1.AgentCardData, protocol, cardId string, targetRef *agentv1alpha1.TargetRef, verificationResult *signature.VerificationResult, binding *bindingResult, identityMatch *bool) error { +// +// Condition semantics: +// - Verified: final trust decision used by NetworkPolicy controller and labels. +// True when mTLS authenticated fetch succeeds OR JWS signature + binding passes. +// Absent when no trust mechanism is configured. +// - SignatureVerified: raw JWS cryptographic outcome (informational only, not used +// for enforcement). Reflects whether the x5c signature is mathematically valid. +// - Synced: whether the agent card was successfully fetched. +// - Ready: composite signal — True when Synced is True AND (Verified is True or absent). +// - Bound: whether identity binding constraints are satisfied. +func (r *AgentCardReconciler) updateAgentCardStatus( //nolint:gocyclo + ctx context.Context, agentCard *agentv1alpha1.AgentCard, + cardData *agentv1alpha1.AgentCardData, protocol, cardID string, + targetRef *agentv1alpha1.TargetRef, verificationResult *signature.VerificationResult, + binding *bindingResult, identityMatch *bool, mTLSVerified bool, + verifiedReason string, verifiedStatus metav1.ConditionStatus, + verifiedMessage string, attestedSpiffeID string, +) error { return retry.RetryOnConflict(retry.DefaultRetry, func() error { latest := &agentv1alpha1.AgentCard{} if err := r.Get(ctx, types.NamespacedName{ @@ -603,9 +725,9 @@ func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCa latest.Status.Card = cardData latest.Status.Protocol = protocol latest.Status.TargetRef = targetRef - if cardId != "" && cardId != latest.Status.CardId { + if cardID != "" && cardID != latest.Status.CardId { latest.Status.LastSyncTime = &metav1.Time{Time: time.Now()} - latest.Status.CardId = cardId + latest.Status.CardId = cardID } else if latest.Status.LastSyncTime == nil { latest.Status.LastSyncTime = &metav1.Time{Time: time.Now()} } @@ -640,7 +762,7 @@ func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCa meta.SetStatusCondition(&latest.Status.Conditions, sigCondition) } - if verificationResult != nil && !verificationResult.Verified && !r.SignatureAuditMode { + if verificationResult != nil && !verificationResult.Verified && !r.SignatureAuditMode && !mTLSVerified { meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ Type: "Synced", Status: metav1.ConditionFalse, @@ -651,6 +773,8 @@ func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCa message := fmt.Sprintf("Successfully fetched agent card for %s", cardData.Name) if verificationResult != nil && !verificationResult.Verified && r.SignatureAuditMode { message = fmt.Sprintf("Fetched agent card for %s (signature verification failed but audit mode enabled)", cardData.Name) + } else if mTLSVerified && verificationResult != nil && !verificationResult.Verified { + message = fmt.Sprintf("Successfully fetched agent card for %s (mTLS verified, no JWS signature)", cardData.Name) } meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ Type: "Synced", @@ -660,25 +784,50 @@ func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCa }) } + // Verified condition: set only when a trust mechanism is active (non-empty reason). + if verifiedReason != "" { + meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ + Type: ConditionVerified, + Status: verifiedStatus, + Reason: verifiedReason, + Message: verifiedMessage, + }) + } + + // AttestedAgentSpiffeID from mTLS + if attestedSpiffeID != "" { + latest.Status.AttestedAgentSpiffeID = attestedSpiffeID + } + + // Ready = Synced AND (Verified is True OR Verified was never evaluated) + var readyStatus metav1.ConditionStatus + var readyReason, readyMessage string + syncedCond := meta.FindStatusCondition(latest.Status.Conditions, "Synced") + verifiedCond := meta.FindStatusCondition(latest.Status.Conditions, ConditionVerified) + if syncedCond != nil && syncedCond.Status == metav1.ConditionTrue { + if verifiedCond == nil || verifiedCond.Status == metav1.ConditionTrue { + readyStatus = metav1.ConditionTrue + readyReason = "ReadyToServe" + readyMessage = "Agent index is ready for queries" + } else { + readyStatus = metav1.ConditionFalse + readyReason = "VerificationFailed" + readyMessage = "Agent synced but identity verification failed" + } + } else { + readyStatus = metav1.ConditionFalse + readyReason = "SyncFailed" + readyMessage = "Agent card fetch failed" + } meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ Type: "Ready", - Status: metav1.ConditionTrue, - Reason: "ReadyToServe", - Message: "Agent index is ready for queries", + Status: readyStatus, + Reason: readyReason, + Message: readyMessage, }) if binding != nil { existingBound := meta.FindStatusCondition(latest.Status.Conditions, "Bound") - - if existingBound == nil { - agentCardLogger.Info("Identity binding is allowlist-only; SPIFFE trust bundle verification not yet available", - "agentCard", latest.Name) - if r.Recorder != nil { - r.Recorder.Event(agentCard, corev1.EventTypeWarning, "AllowlistOnly", - "Identity binding is allowlist-only; SPIFFE trust bundle verification not yet available") - } - } - newConditionStatus := metav1.ConditionFalse if binding.Bound { newConditionStatus = metav1.ConditionTrue @@ -692,34 +841,7 @@ func (r *AgentCardReconciler) updateAgentCardStatus(ctx context.Context, agentCa } } } - - bindingChanged := latest.Status.BindingStatus == nil || - latest.Status.BindingStatus.Bound != binding.Bound || - latest.Status.BindingStatus.Reason != binding.Reason || - latest.Status.BindingStatus.Message != binding.Message - var evalTime *metav1.Time - if latest.Status.BindingStatus != nil { - evalTime = latest.Status.BindingStatus.LastEvaluationTime - } - if bindingChanged || evalTime == nil { - now := metav1.Now() - evalTime = &now - } - latest.Status.BindingStatus = &agentv1alpha1.BindingStatus{ - Bound: binding.Bound, - Reason: binding.Reason, - Message: binding.Message, - LastEvaluationTime: evalTime, - } - if binding.SpiffeID != "" { - latest.Status.ExpectedSpiffeID = binding.SpiffeID - } - meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ - Type: "Bound", - Status: newConditionStatus, - Reason: binding.Reason, - Message: binding.Message, - }) + applyBindingToStatus(latest, binding) } latest.Status.SignatureIdentityMatch = identityMatch @@ -904,6 +1026,61 @@ func (r *AgentCardReconciler) updateCondition(ctx context.Context, agentCard *ag return nil } +// ConditionVerified is the condition type for mTLS-verified fetch status. +const ConditionVerified = "Verified" + +// setVerifiedCondition sets the Verified condition on the AgentCard status. +func (r *AgentCardReconciler) setVerifiedCondition( + ctx context.Context, agentCard *agentv1alpha1.AgentCard, + status metav1.ConditionStatus, reason, message string, +) error { + return r.updateCondition(ctx, agentCard, ConditionVerified, status, reason, message) +} + +// propagateVerifiedLabel syncs the identity-verified label based on Verified status. +// Uses per-card annotation AND-aggregation (same pattern as propagateSignatureLabel). +func (r *AgentCardReconciler) propagateVerifiedLabel( + ctx context.Context, agentCard *agentv1alpha1.AgentCard, + workload *WorkloadInfo, verified bool, +) error { + if workload == nil { + return nil + } + return r.propagateSignatureLabel(ctx, agentCard.Name, workload, verified) +} + +// cleanupVerifiedFetchFields removes stale Phase 1 fields when verifiedFetch is disabled. +func (r *AgentCardReconciler) cleanupVerifiedFetchFields(ctx context.Context, agentCard *agentv1alpha1.AgentCard) { + latest := &agentv1alpha1.AgentCard{} + if err := r.Get(ctx, types.NamespacedName{ + Name: agentCard.Name, + Namespace: agentCard.Namespace, + }, latest); err != nil { + agentCardLogger.Error(err, "Failed to get AgentCard for cleanup", "agentCard", agentCard.Name) + return + } + + verifiedCond := meta.FindStatusCondition(latest.Status.Conditions, ConditionVerified) + needsUpdate := verifiedCond != nil || latest.Status.AttestedAgentSpiffeID != "" + if !needsUpdate { + return + } + + if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := r.Get(ctx, types.NamespacedName{ + Name: agentCard.Name, + Namespace: agentCard.Namespace, + }, latest); err != nil { + return err + } + meta.RemoveStatusCondition(&latest.Status.Conditions, ConditionVerified) + latest.Status.AttestedAgentSpiffeID = "" + return r.Status().Update(ctx, latest) + }); err != nil { + agentCardLogger.Error(err, "Failed to cleanup verified fetch fields", "agentCard", agentCard.Name) + } +} + func (r *AgentCardReconciler) handleDeletion(ctx context.Context, agentCard *agentv1alpha1.AgentCard) (ctrl.Result, error) { if controllerutil.ContainsFinalizer(agentCard, AgentCardFinalizer) { agentCardLogger.Info("Cleaning up AgentCard", "name", agentCard.Name) @@ -1053,7 +1230,7 @@ func (r *AgentCardReconciler) computeBinding(agentCard *agentv1alpha1.AgentCard, return &bindingResult{ Bound: false, Reason: ReasonNotBound, - Message: "No SPIFFE ID from x5c certificate chain: ensure the card is signed with a SPIRE-issued SVID", + Message: "No verified SPIFFE ID available: ensure the card is signed with a SPIRE-issued SVID or served over mTLS", } } @@ -1091,6 +1268,43 @@ func (r *AgentCardReconciler) computeBinding(agentCard *agentv1alpha1.AgentCard, return &bindingResult{Bound: bound, Reason: reason, Message: message, SpiffeID: verifiedSpiffeID} } +// applyBindingToStatus writes binding fields and the Bound condition onto an AgentCard status. +// Used by both the main status update path and the standalone error-path binding update. +func applyBindingToStatus(latest *agentv1alpha1.AgentCard, binding *bindingResult) { + bindingChanged := latest.Status.BindingStatus == nil || + latest.Status.BindingStatus.Bound != binding.Bound || + latest.Status.BindingStatus.Reason != binding.Reason || + latest.Status.BindingStatus.Message != binding.Message + var evalTime *metav1.Time + if latest.Status.BindingStatus != nil { + evalTime = latest.Status.BindingStatus.LastEvaluationTime + } + if bindingChanged || evalTime == nil { + now := metav1.Now() + evalTime = &now + } + latest.Status.BindingStatus = &agentv1alpha1.BindingStatus{ + Bound: binding.Bound, + Reason: binding.Reason, + Message: binding.Message, + LastEvaluationTime: evalTime, + } + if binding.SpiffeID != "" { + latest.Status.ExpectedSpiffeID = binding.SpiffeID + } + + conditionStatus := metav1.ConditionFalse + if binding.Bound { + conditionStatus = metav1.ConditionTrue + } + meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ + Type: "Bound", + Status: conditionStatus, + Reason: binding.Reason, + Message: binding.Message, + }) +} + // updateBindingStatus writes binding status when the main status path is unreachable. func (r *AgentCardReconciler) updateBindingStatus(ctx context.Context, agentCard *agentv1alpha1.AgentCard, bound bool, reason, message, expectedSpiffeID string) error { return retry.RetryOnConflict(retry.DefaultRetry, func() error { @@ -1102,45 +1316,19 @@ func (r *AgentCardReconciler) updateBindingStatus(ctx context.Context, agentCard return err } - bindingChanged := latest.Status.BindingStatus == nil || - latest.Status.BindingStatus.Bound != bound || - latest.Status.BindingStatus.Reason != reason || - latest.Status.BindingStatus.Message != message - var evalTime *metav1.Time - if latest.Status.BindingStatus != nil { - evalTime = latest.Status.BindingStatus.LastEvaluationTime - } - if bindingChanged || evalTime == nil { - now := metav1.Now() - evalTime = &now - } - latest.Status.BindingStatus = &agentv1alpha1.BindingStatus{ - Bound: bound, - Reason: reason, - Message: message, - LastEvaluationTime: evalTime, - } - if expectedSpiffeID != "" { - latest.Status.ExpectedSpiffeID = expectedSpiffeID - } - - conditionStatus := metav1.ConditionFalse - if bound { - conditionStatus = metav1.ConditionTrue - } - meta.SetStatusCondition(&latest.Status.Conditions, metav1.Condition{ - Type: "Bound", - Status: conditionStatus, - Reason: reason, - Message: message, + applyBindingToStatus(latest, &bindingResult{ + Bound: bound, + Reason: reason, + Message: message, + SpiffeID: expectedSpiffeID, }) return r.Status().Update(ctx, latest) }) } -// computeCardId returns a SHA-256 hash of the card data for drift detection. -func (r *AgentCardReconciler) computeCardId(cardData *agentv1alpha1.AgentCardData) string { +// computeCardID returns a SHA-256 hash of the card data for drift detection. +func (r *AgentCardReconciler) computeCardID(cardData *agentv1alpha1.AgentCardData) string { if cardData == nil { return "" } @@ -1157,14 +1345,9 @@ func (r *AgentCardReconciler) computeCardId(cardData *agentv1alpha1.AgentCardDat // 1. The leaf SVID cert is approaching expiry (within SVIDExpiryGracePeriod). // 2. The trust bundle hash changed since the workload was last (re)started. // -// When operator signing is enabled (EnableOperatorSign), the operator re-signs on each -// reconcile using its auto-rotating X509Source, so pod restarts for re-signing are -// unnecessary. In init-container mode, the pod template annotation is patched to trigger +// In init-container mode, the pod template annotation is patched to trigger // a rollout so the init-container re-runs and fetches a fresh SVID. func (r *AgentCardReconciler) maybeRestartForResign(ctx context.Context, agentCard *agentv1alpha1.AgentCard, workload *WorkloadInfo, vr *signature.VerificationResult) { - if r.EnableOperatorSign { - return - } if workload == nil || r.SignatureProvider == nil { return } diff --git a/kagenti-operator/internal/controller/agentcard_controller_test.go b/kagenti-operator/internal/controller/agentcard_controller_test.go index 4d0a0d87..db120deb 100644 --- a/kagenti-operator/internal/controller/agentcard_controller_test.go +++ b/kagenti-operator/internal/controller/agentcard_controller_test.go @@ -36,44 +36,6 @@ import ( "github.com/kagenti/operator/internal/agentcard" ) -// mockSigner implements signature.Signer for testing operator signing path. -type mockSigner struct { - mu sync.Mutex - signErr error - signedOut []byte - ready bool - spiffeID string - signCount int -} - -func (m *mockSigner) SignCard(_ context.Context, cardData *agentv1alpha1.AgentCardData) ([]byte, error) { - m.mu.Lock() - defer m.mu.Unlock() - m.signCount++ - if m.signErr != nil { - return nil, m.signErr - } - if m.signedOut != nil { - return m.signedOut, nil - } - out, _ := json.Marshal(cardData) - return out, nil -} - -func (m *mockSigner) SpiffeID() string { - m.mu.Lock() - defer m.mu.Unlock() - return m.spiffeID -} - -func (m *mockSigner) Ready() bool { - m.mu.Lock() - defer m.mu.Unlock() - return m.ready -} - -func (m *mockSigner) Close() error { return nil } - // mockFetcher implements agentcard.Fetcher for testing type mockFetcher struct { cardData *agentv1alpha1.AgentCardData @@ -1762,193 +1724,6 @@ var _ = Describe("AgentCard Controller - getSyncPeriod", func() { }) }) -var _ = Describe("AgentCard Operator Signing", func() { - const ( - opsDeployName = "opsign-agent" - opsCardName = "opsign-card" - opsSvcName = "opsign-agent" - namespace = "default" - ) - - ctx := context.Background() - nn := types.NamespacedName{Name: opsCardName, Namespace: namespace} - - BeforeEach(func() { - By("creating a Deployment with agent labels") - replicas := int32(1) - dep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: opsDeployName, - Namespace: namespace, - Labels: map[string]string{ - LabelAgentType: LabelValueAgent, - ProtocolLabelPrefix + "a2a": "", - }, - }, - Spec: appsv1.DeploymentSpec{ - Replicas: &replicas, - Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": opsDeployName}}, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{ - "app": opsDeployName, - LabelAgentType: LabelValueAgent, - ProtocolLabelPrefix + "a2a": "", - }}, - Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "agent", Image: "img:latest"}}}, - }, - }, - } - Expect(k8sClient.Create(ctx, dep)).To(Succeed()) - - By("setting Deployment status to Available") - Eventually(func() error { - if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsDeployName, Namespace: namespace}, dep); err != nil { - return err - } - dep.Status.Conditions = []appsv1.DeploymentCondition{ - {Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue}, - } - return k8sClient.Status().Update(ctx, dep) - }).Should(Succeed()) - - By("creating a Service for the Deployment") - svc := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{Name: opsSvcName, Namespace: namespace}, - Spec: corev1.ServiceSpec{Ports: []corev1.ServicePort{{Port: 8000}}}, - } - Expect(k8sClient.Create(ctx, svc)).To(Succeed()) - - By("creating an AgentCard with targetRef") - card := &agentv1alpha1.AgentCard{ - ObjectMeta: metav1.ObjectMeta{Name: opsCardName, Namespace: namespace}, - Spec: agentv1alpha1.AgentCardSpec{ - SyncPeriod: "30s", - TargetRef: &agentv1alpha1.TargetRef{APIVersion: "apps/v1", Kind: "Deployment", Name: opsDeployName}, - }, - } - Expect(k8sClient.Create(ctx, card)).To(Succeed()) - }) - - AfterEach(func() { - By("cleaning up resources") - ac := &agentv1alpha1.AgentCard{} - if err := k8sClient.Get(ctx, nn, ac); err == nil { - ac.Finalizers = nil - _ = k8sClient.Update(ctx, ac) - _ = k8sClient.Delete(ctx, ac) - } - dep := &appsv1.Deployment{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsDeployName, Namespace: namespace}, dep); err == nil { - Expect(k8sClient.Delete(ctx, dep)).To(Succeed()) - } - svc := &corev1.Service{} - if err := k8sClient.Get(ctx, types.NamespacedName{Name: opsSvcName, Namespace: namespace}, svc); err == nil { - Expect(k8sClient.Delete(ctx, svc)).To(Succeed()) - } - }) - - It("should skip signing when operator signing disabled", func() { - signer := &mockSigner{ready: true, spiffeID: "spiffe://example.org/operator"} - - reconciler := &AgentCardReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, - EnableOperatorSign: false, - CardSigner: signer, - } - - _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - - signer.mu.Lock() - count := signer.signCount - signer.mu.Unlock() - Expect(count).To(Equal(0), "signer should not be called when operator signing disabled") - }) - - It("should sign card when operator signing enabled", func() { - signer := &mockSigner{ - ready: true, - spiffeID: "spiffe://example.org/kagenti-operator", - } - - reconciler := &AgentCardReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "OpSign", Version: "1", URL: "http://x"}}, - EnableOperatorSign: true, - CardSigner: signer, - } - - _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - - signer.mu.Lock() - count := signer.signCount - signer.mu.Unlock() - Expect(count).To(Equal(1), "signer should be called once") - }) - - It("should requeue when signer not ready", func() { - signer := &mockSigner{ - ready: false, - spiffeID: "", - } - - reconciler := &AgentCardReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, - EnableOperatorSign: true, - CardSigner: signer, - } - - _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - Expect(result.RequeueAfter).To(Equal(10 * time.Second)) - - card := &agentv1alpha1.AgentCard{} - Expect(k8sClient.Get(ctx, nn, card)).To(Succeed()) - syncCond := findCondition(card.Status.Conditions, "Synced") - Expect(syncCond).NotTo(BeNil()) - Expect(syncCond.Reason).To(Equal("SignerNotReady")) - }) - - It("should handle signing failure gracefully", func() { - signer := &mockSigner{ - ready: true, - signErr: errors.New("SVID expired"), - spiffeID: "spiffe://example.org/operator", - } - - reconciler := &AgentCardReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - AgentFetcher: &mockFetcher{cardData: &agentv1alpha1.AgentCardData{Name: "A", Version: "1", URL: "http://x"}}, - EnableOperatorSign: true, - CardSigner: signer, - } - - _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: nn}) - Expect(err).NotTo(HaveOccurred()) - Expect(result.RequeueAfter).To(Equal(30 * time.Second)) - - card := &agentv1alpha1.AgentCard{} - Expect(k8sClient.Get(ctx, nn, card)).To(Succeed()) - syncCond := findCondition(card.Status.Conditions, "Synced") - Expect(syncCond).NotTo(BeNil()) - Expect(syncCond.Reason).To(Equal("SigningFailed")) - }) - -}) - // Helper function to find a condition by type func findCondition(conditions []metav1.Condition, conditionType string) *metav1.Condition { for i := range conditions { diff --git a/kagenti-operator/internal/controller/identity_binding_test.go b/kagenti-operator/internal/controller/identity_binding_test.go index 063ebbca..dc52dd97 100644 --- a/kagenti-operator/internal/controller/identity_binding_test.go +++ b/kagenti-operator/internal/controller/identity_binding_test.go @@ -178,11 +178,11 @@ var _ = Describe("Identity Binding — Trust Domain Only", func() { URL: "http://localhost:8000", } - cardId1 := reconciler.computeCardId(cardData) - cardId2 := reconciler.computeCardId(cardData) + cardID1 := reconciler.computeCardID(cardData) + cardID2 := reconciler.computeCardID(cardData) - Expect(cardId1).NotTo(BeEmpty()) - Expect(cardId1).To(Equal(cardId2)) + Expect(cardID1).NotTo(BeEmpty()) + Expect(cardID1).To(Equal(cardID2)) }) It("should compute different card ID for different card data", func() { @@ -201,12 +201,12 @@ var _ = Describe("Identity Binding — Trust Domain Only", func() { Version: "2.0.0", } - cardId1 := reconciler.computeCardId(cardData1) - cardId2 := reconciler.computeCardId(cardData2) + cardID1 := reconciler.computeCardID(cardData1) + cardID2 := reconciler.computeCardID(cardData2) - Expect(cardId1).NotTo(BeEmpty()) - Expect(cardId2).NotTo(BeEmpty()) - Expect(cardId1).NotTo(Equal(cardId2)) + Expect(cardID1).NotTo(BeEmpty()) + Expect(cardID2).NotTo(BeEmpty()) + Expect(cardID1).NotTo(Equal(cardID2)) }) }) }) From 6d94ff6480bb77ac0b698879b7b21c051f3b610a Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 22:38:45 +0100 Subject: [PATCH 11/16] feat: add Helm chart support for verified fetch and fix NP ingress - Add verifiedFetch Helm values with SPIFFE CSI volume and operator flags - Create ClusterSPIFFEID template for operator workload identity - Inject POD_NAMESPACE via Downward API for NetworkPolicy peer selection - Use kubernetes.io/metadata.name label instead of custom labeling - Replace hardcoded namespace with ClusterDefaultsNamespace constant Signed-off-by: Kevin Cogan --- .../templates/manager/clusterspiffeid.yaml | 2 +- .../templates/manager/manager.yaml | 25 +-- charts/kagenti-operator/values.yaml | 15 +- .../agentcard_networkpolicy_controller.go | 49 +++--- ...agentcard_networkpolicy_controller_test.go | 148 ++++++++++++++++++ 5 files changed, 202 insertions(+), 37 deletions(-) diff --git a/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml b/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml index c4e38e76..2b327b98 100644 --- a/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml +++ b/charts/kagenti-operator/templates/manager/clusterspiffeid.yaml @@ -1,4 +1,4 @@ -{{- if .Values.operatorSigning.enabled }} +{{- if .Values.verifiedFetch.enabled }} apiVersion: spire.spiffe.io/v1alpha1 kind: ClusterSPIFFEID metadata: diff --git a/charts/kagenti-operator/templates/manager/manager.yaml b/charts/kagenti-operator/templates/manager/manager.yaml index f9483868..e881c027 100644 --- a/charts/kagenti-operator/templates/manager/manager.yaml +++ b/charts/kagenti-operator/templates/manager/manager.yaml @@ -34,18 +34,21 @@ spec: {{- if .Values.mlflow.enable }} - "--enable-mlflow=true" {{- end }} - {{- if .Values.operatorSigning.enabled }} - - "--enable-operator-signing=true" - - "--spiffe-endpoint-socket={{ .Values.operatorSigning.spiffeEndpointSocket }}" + {{- if .Values.verifiedFetch.enabled }} + - "--enable-verified-fetch=true" + - "--verified-fetch-spiffe-socket={{ .Values.verifiedFetch.spiffeEndpointSocket }}" + {{- if and .Values.verifiedFetch.spireTrustDomain (not .Values.signatureVerification.enabled) }} + - "--spire-trust-domain={{ .Values.verifiedFetch.spireTrustDomain }}" + {{- end }} + {{- end }} + {{- if or .Values.enforceNetworkPolicies .Values.signatureVerification.enforceNetworkPolicies }} + - "--enforce-network-policies=true" {{- end }} {{- if .Values.signatureVerification.enabled }} - "--require-a2a-signature=true" {{- if .Values.signatureVerification.auditMode }} - "--signature-audit-mode=true" {{- end }} - {{- if .Values.signatureVerification.enforceNetworkPolicies }} - - "--enforce-network-policies=true" - {{- end }} {{- if .Values.signatureVerification.spireTrustDomain }} - "--spire-trust-domain={{ .Values.signatureVerification.spireTrustDomain }}" {{- end }} @@ -69,13 +72,15 @@ spec: - {{ .Values.controllerManager.container.cmd }} image: {{ .Values.controllerManager.container.image.repository }}:{{ .Values.controllerManager.container.image.tag }} imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy | default "IfNotPresent" }} - {{- if .Values.controllerManager.container.env }} env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace {{- range $key, $value := .Values.controllerManager.container.env }} - name: {{ $key }} value: {{ $value | quote }} {{- end }} - {{- end }} livenessProbe: {{- toYaml .Values.controllerManager.container.livenessProbe | nindent 12 }} readinessProbe: @@ -109,7 +114,7 @@ spec: mountPath: /tmp/k8s-metrics-server/metrics-certs readOnly: true {{- end }} - {{- if .Values.operatorSigning.enabled }} + {{- if .Values.verifiedFetch.enabled }} - name: spiffe-workload-api mountPath: /spiffe-workload-api readOnly: true @@ -137,7 +142,7 @@ spec: secret: secretName: kagenti-operator-metrics-server-cert {{- end }} - {{- if .Values.operatorSigning.enabled }} + {{- if .Values.verifiedFetch.enabled }} - name: spiffe-workload-api csi: driver: "csi.spiffe.io" diff --git a/charts/kagenti-operator/values.yaml b/charts/kagenti-operator/values.yaml index f85aa7fe..681e0943 100644 --- a/charts/kagenti-operator/values.yaml +++ b/charts/kagenti-operator/values.yaml @@ -88,15 +88,24 @@ certmanager: networkPolicy: enable: false +# Enforce NetworkPolicies based on AgentCard verification status. +# Works with both verifiedFetch (mTLS) and signatureVerification (init-signer). +enforceNetworkPolicies: false + # [MLFLOW]: MLflow experiment tracking integration mlflow: enable: false -# [OPERATOR SIGNING]: Operator signs agent cards with its own SPIFFE identity -# Replaces the init-container self-signing model (Phase 2A of Sigstore RFC) -operatorSigning: +# [VERIFIED FETCH]: mTLS-authenticated fetch of agent cards via SPIFFE identity (Phase 1) +# When enabled, the operator uses go-spiffe mTLS to fetch agent cards and records +# the agent's attested SPIFFE ID in CRD status. +verifiedFetch: enabled: false spiffeEndpointSocket: "unix:///spiffe-workload-api/spire-agent.sock" + # Agents must expose a Service port named "agent-tls" (default 8443). + # The operator discovers the TLS endpoint by port name, not number. + # SPIRE trust domain for identity binding (required when enabled) + spireTrustDomain: "" # [SIGNATURE VERIFICATION]: A2A agent card signature verification via SPIRE x5c signatureVerification: diff --git a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go index 32f1464c..6a281ce4 100644 --- a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go +++ b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller.go @@ -24,6 +24,7 @@ import ( corev1 "k8s.io/api/core/v1" netv1 "k8s.io/api/networking/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -42,11 +43,15 @@ const NetworkPolicyFinalizer = "agentcard.kagenti.dev/network-policy" var networkPolicyLogger = ctrl.Log.WithName("controller").WithName("AgentCardNetworkPolicy") -// AgentCardNetworkPolicyReconciler manages NetworkPolicies based on AgentCard signature status. +// AgentCardNetworkPolicyReconciler manages NetworkPolicies based on AgentCard verification status. type AgentCardNetworkPolicyReconciler struct { client.Client Scheme *runtime.Scheme EnforceNetworkPolicies bool + // OperatorNamespace is the namespace the operator runs in. Used to allow + // ingress from the operator in generated NetworkPolicies via the built-in + // kubernetes.io/metadata.name label (no manual namespace labeling required). + OperatorNamespace string // KubeAPIServerCIDRs are the /32 CIDRs of the K8s API server endpoints. // Populated at startup from the "kubernetes" Endpoints in the default namespace. // Used to allow workload egress to the API server in restrictive policies. @@ -141,13 +146,10 @@ func (r *AgentCardNetworkPolicyReconciler) manageNetworkPolicy(ctx context.Conte policyName := fmt.Sprintf("%s-signature-policy", workloadName) isVerified := false - if agentCard.Spec.IdentityBinding != nil { - isVerified = agentCard.Status.SignatureIdentityMatch != nil && *agentCard.Status.SignatureIdentityMatch - if !isVerified && !agentCard.Spec.IdentityBinding.Strict { - isVerified = true - } - } else { - isVerified = agentCard.Status.ValidSignature != nil && *agentCard.Status.ValidSignature + + verifiedCond := meta.FindStatusCondition(agentCard.Status.Conditions, ConditionVerified) + if verifiedCond != nil { + isVerified = verifiedCond.Status == metav1.ConditionTrue } if isVerified { @@ -164,7 +166,7 @@ func (r *AgentCardNetworkPolicyReconciler) upsertNetworkPolicy(ctx context.Conte Labels: map[string]string{ "managed-by": "kagenti-operator", "kagenti.dev/agentcard": agentCard.Name, - "kagenti.dev/policy-type": "signature-verification", + "kagenti.dev/policy-type": "identity-verification", }, }, Spec: spec, @@ -196,7 +198,7 @@ func (r *AgentCardNetworkPolicyReconciler) createPermissivePolicy( ctx context.Context, policyName string, agentCard *agentv1alpha1.AgentCard, podSelectorLabels map[string]string, ) error { - ingressRule := operatorIngressRule() + ingressRule := r.operatorIngressRule() ingressRule.From = append(ingressRule.From, netv1.NetworkPolicyPeer{ PodSelector: &metav1.LabelSelector{ MatchLabels: map[string]string{LabelSignatureVerified: "true"}, @@ -213,21 +215,22 @@ func (r *AgentCardNetworkPolicyReconciler) createPermissivePolicy( return r.upsertNetworkPolicy(ctx, policyName, agentCard, spec) } -func operatorIngressRule() netv1.NetworkPolicyIngressRule { - return netv1.NetworkPolicyIngressRule{ - From: []netv1.NetworkPolicyPeer{ - { - NamespaceSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"control-plane": "kagenti-operator"}, - }, - }, - { - NamespaceSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"name": "kagenti-system"}, - }, +func (r *AgentCardNetworkPolicyReconciler) operatorIngressRule() netv1.NetworkPolicyIngressRule { + peers := []netv1.NetworkPolicyPeer{ + { + NamespaceSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"kubernetes.io/metadata.name": r.OperatorNamespace}, }, }, } + if r.OperatorNamespace != ClusterDefaultsNamespace { + peers = append(peers, netv1.NetworkPolicyPeer{ + NamespaceSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"kubernetes.io/metadata.name": ClusterDefaultsNamespace}, + }, + }) + } + return netv1.NetworkPolicyIngressRule{From: peers} } func (r *AgentCardNetworkPolicyReconciler) createRestrictivePolicy( @@ -238,7 +241,7 @@ func (r *AgentCardNetworkPolicyReconciler) createRestrictivePolicy( spec := netv1.NetworkPolicySpec{ PodSelector: metav1.LabelSelector{MatchLabels: podSelectorLabels}, PolicyTypes: []netv1.PolicyType{netv1.PolicyTypeIngress, netv1.PolicyTypeEgress}, - Ingress: []netv1.NetworkPolicyIngressRule{operatorIngressRule()}, + Ingress: []netv1.NetworkPolicyIngressRule{r.operatorIngressRule()}, Egress: r.kubeAPIEgressRules(), } return r.upsertNetworkPolicy(ctx, policyName, agentCard, spec) diff --git a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go index e2bbd5b5..6b91e357 100644 --- a/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go +++ b/kagenti-operator/internal/controller/agentcard_networkpolicy_controller_test.go @@ -39,6 +39,7 @@ func newNPReconciler(enforce bool) *AgentCardNetworkPolicyReconciler { Client: k8sClient, Scheme: k8sClient.Scheme(), EnforceNetworkPolicies: enforce, + OperatorNamespace: "kagenti-operator-system", KubeAPIServerCIDRs: []string{"10.0.0.1/32", "10.0.0.2/32"}, } } @@ -71,6 +72,33 @@ func createCardWithStatus(name, ns, deploymentName string, validSig *bool, ident if validSig != nil || identityMatch != nil { card.Status.ValidSignature = validSig card.Status.SignatureIdentityMatch = identityMatch + + verified := false + if validSig != nil && *validSig { + verified = true + } + if binding != nil && identityMatch != nil { + if *identityMatch { + verified = true + } else if binding.Strict { + verified = false + } else { + verified = true + } + } + verifiedStatus := metav1.ConditionFalse + reason := "SignatureInvalid" + if verified { + verifiedStatus = metav1.ConditionTrue + reason = "SignatureVerified" + } + card.Status.Conditions = []metav1.Condition{{ + Type: ConditionVerified, + Status: verifiedStatus, + Reason: reason, + Message: "test", + LastTransitionTime: metav1.Now(), + }} ExpectWithOffset(1, k8sClient.Status().Update(ctx, card)).To(Succeed()) } } @@ -276,6 +304,12 @@ var _ = Describe("AgentCardNetworkPolicyReconciler", func() { card := &agentv1alpha1.AgentCard{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: agentCardName, Namespace: namespace}, card)).To(Succeed()) card.Status.ValidSignature = ptr.To(false) + for i := range card.Status.Conditions { + if card.Status.Conditions[i].Type == ConditionVerified { + card.Status.Conditions[i].Status = metav1.ConditionFalse + card.Status.Conditions[i].Reason = "SignatureInvalid" + } + } Expect(k8sClient.Status().Update(ctx, card)).To(Succeed()) reconcileNP(r, agentCardName, namespace) @@ -348,4 +382,118 @@ var _ = Describe("AgentCardNetworkPolicyReconciler", func() { Expect(err).To(HaveOccurred()) }) }) + + Context("Verified condition (Phase 1 mTLS path)", func() { + const ( + deploymentName = "np-verified-cond-agent" + agentCardName = "np-verified-cond-card" + namespace = "default" + ) + + AfterEach(func() { + cleanupResource(ctx, &agentv1alpha1.AgentCard{}, agentCardName, namespace) + cleanupResource(ctx, &appsv1.Deployment{}, deploymentName, namespace) + cleanupResource(ctx, &corev1.Service{}, deploymentName, namespace) + }) + + It("should create permissive policy when Verified condition is True", func() { + createDeploymentWithService(ctx, deploymentName, namespace) + createCardWithVerifiedCondition(agentCardName, namespace, deploymentName, metav1.ConditionTrue) + + r := newNPReconciler(true) + reconcileNP(r, agentCardName, namespace) + reconcileNP(r, agentCardName, namespace) + + p := getPolicy(deploymentName, namespace) + Expect(p.Spec.Ingress[0].From).To(HaveLen(3)) + Expect(policyHasVerifiedPodIngress(p)).To(BeTrue()) + }) + + It("should create restrictive policy when Verified condition is False", func() { + createDeploymentWithService(ctx, deploymentName, namespace) + createCardWithVerifiedCondition(agentCardName, namespace, deploymentName, metav1.ConditionFalse) + + r := newNPReconciler(true) + reconcileNP(r, agentCardName, namespace) + reconcileNP(r, agentCardName, namespace) + + p := getPolicy(deploymentName, namespace) + Expect(p.Spec.Ingress[0].From).To(HaveLen(2)) + Expect(p.Spec.Ingress[0].From[0].PodSelector).To(BeNil()) + }) + + It("should be permissive when Verified condition is True (set via signature)", func() { + createDeploymentWithService(ctx, deploymentName, namespace) + createCardWithStatus(agentCardName, namespace, deploymentName, ptr.To(true), nil, nil) + + r := newNPReconciler(true) + reconcileNP(r, agentCardName, namespace) + reconcileNP(r, agentCardName, namespace) + + p := getPolicy(deploymentName, namespace) + Expect(p.Spec.Ingress[0].From).To(HaveLen(3)) + Expect(policyHasVerifiedPodIngress(p)).To(BeTrue()) + }) + + It("Verified condition takes precedence over ValidSignature", func() { + createDeploymentWithService(ctx, deploymentName, namespace) + + card := &agentv1alpha1.AgentCard{ + ObjectMeta: metav1.ObjectMeta{Name: agentCardName, Namespace: namespace}, + Spec: agentv1alpha1.AgentCardSpec{ + TargetRef: &agentv1alpha1.TargetRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: deploymentName, + }, + }, + } + Expect(k8sClient.Create(ctx, card)).To(Succeed()) + + card.Status.ValidSignature = ptr.To(true) + card.Status.Conditions = []metav1.Condition{ + { + Type: ConditionVerified, + Status: metav1.ConditionFalse, + Reason: "FetchFailed", + Message: "mTLS fetch failed", + LastTransitionTime: metav1.Now(), + }, + } + Expect(k8sClient.Status().Update(ctx, card)).To(Succeed()) + + r := newNPReconciler(true) + reconcileNP(r, agentCardName, namespace) + reconcileNP(r, agentCardName, namespace) + + p := getPolicy(deploymentName, namespace) + Expect(p.Spec.Ingress[0].From).To(HaveLen(2)) + Expect(p.Spec.Ingress[0].From[0].PodSelector).To(BeNil()) + }) + }) }) + +func createCardWithVerifiedCondition(name, ns, deploymentName string, condStatus metav1.ConditionStatus) { + card := &agentv1alpha1.AgentCard{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + Spec: agentv1alpha1.AgentCardSpec{ + TargetRef: &agentv1alpha1.TargetRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: deploymentName, + }, + }, + } + ExpectWithOffset(1, k8sClient.Create(ctx, card)).To(Succeed()) + + card.Status.Conditions = []metav1.Condition{ + { + Type: ConditionVerified, + Status: condStatus, + Reason: "mTLSVerified", + Message: "test condition", + LastTransitionTime: metav1.Now(), + }, + } + ExpectWithOffset(1, k8sClient.Status().Update(ctx, card)).To(Succeed()) +} From d9f299a7e8072577278ba07359a890f365601626 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 22:39:11 +0100 Subject: [PATCH 12/16] feat: add test-tls-agent binary for mTLS E2E verification Lightweight test server that serves AgentCard data over mTLS using SPIFFE workload identity. Used for E2E testing of verified fetch on clusters with SPIRE deployed. Signed-off-by: Kevin Cogan --- .../cmd/test-tls-agent/Dockerfile | 21 +++ kagenti-operator/cmd/test-tls-agent/main.go | 175 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 kagenti-operator/cmd/test-tls-agent/Dockerfile create mode 100644 kagenti-operator/cmd/test-tls-agent/main.go diff --git a/kagenti-operator/cmd/test-tls-agent/Dockerfile b/kagenti-operator/cmd/test-tls-agent/Dockerfile new file mode 100644 index 00000000..b2bd0195 --- /dev/null +++ b/kagenti-operator/cmd/test-tls-agent/Dockerfile @@ -0,0 +1,21 @@ +FROM docker.io/golang:1.26 AS builder +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY go.mod go.mod +COPY go.sum go.sum +RUN go mod download + +COPY cmd/test-tls-agent/ cmd/test-tls-agent/ +COPY api/ api/ +COPY internal/ internal/ + +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o test-tls-agent ./cmd/test-tls-agent/ + +FROM gcr.io/distroless/static:nonroot +WORKDIR / +COPY --from=builder /workspace/test-tls-agent . +USER 65532:65532 + +ENTRYPOINT ["/test-tls-agent"] diff --git a/kagenti-operator/cmd/test-tls-agent/main.go b/kagenti-operator/cmd/test-tls-agent/main.go new file mode 100644 index 00000000..68ea1c64 --- /dev/null +++ b/kagenti-operator/cmd/test-tls-agent/main.go @@ -0,0 +1,175 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "log" + "net/http" + "os" + "os/signal" + "sync" + "syscall" + "time" + + "github.com/spiffe/go-spiffe/v2/spiffetls/tlsconfig" + "github.com/spiffe/go-spiffe/v2/workloadapi" +) + +const ( + defaultSocket = "unix:///run/spire/sockets/agent.sock" + defaultTLSPort = "8443" + defaultHTTPPort = "8080" +) + +func main() { + socketPath := envOrDefault("SPIFFE_ENDPOINT_SOCKET", defaultSocket) + tlsPort := envOrDefault("TLS_PORT", defaultTLSPort) + httpPort := envOrDefault("HTTP_PORT", defaultHTTPPort) + agentName := envOrDefault("AGENT_NAME", "tls-test-agent") + podNamespace := envOrDefault("POD_NAMESPACE", "default") + + cardJSON := buildAgentCard(agentName, podNamespace) + + mux := http.NewServeMux() + mux.HandleFunc("/.well-known/agent-card.json", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.Write(cardJSON) + }) + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("ok")) + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sig := make(chan os.Signal, 1) + signal.Notify(sig, syscall.SIGTERM, syscall.SIGINT) + + var wg sync.WaitGroup + + // HTTP server (plain, for fallback testing) + httpServer := &http.Server{ + Addr: ":" + httpPort, + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + } + wg.Add(1) + go func() { + defer wg.Done() + log.Printf("Starting HTTP server on :%s", httpPort) + if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("HTTP server error: %v", err) + } + }() + + // TLS server using SPIFFE SVID + x509Source, err := workloadapi.NewX509Source( + ctx, + workloadapi.WithClientOptions(workloadapi.WithAddr(socketPath)), + ) + if err != nil { + log.Fatalf("Failed to create X509Source: %v", err) + } + defer x509Source.Close() + + svid, err := x509Source.GetX509SVID() + if err != nil { + log.Fatalf("Failed to get initial SVID: %v", err) + } + log.Printf("Got SVID: %s", svid.ID) + + tlsCfg := tlsconfig.MTLSServerConfig(x509Source, x509Source, tlsconfig.AuthorizeAny()) + + tlsServer := &http.Server{ + Addr: ":" + tlsPort, + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + TLSConfig: tlsCfg, + } + wg.Add(1) + go func() { + defer wg.Done() + log.Printf("Starting TLS server on :%s (SPIFFE mTLS)", tlsPort) + ln, err := tls.Listen("tcp", ":"+tlsPort, tlsCfg) + if err != nil { + log.Fatalf("TLS listen error: %v", err) + } + if err := tlsServer.Serve(ln); err != nil && err != http.ErrServerClosed { + log.Fatalf("TLS server error: %v", err) + } + }() + + <-sig + log.Println("Shutting down...") + cancel() + + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + httpServer.Shutdown(shutdownCtx) + tlsServer.Shutdown(shutdownCtx) + wg.Wait() +} + +func buildAgentCard(agentName, namespace string) []byte { + cardPath := envOrDefault("AGENT_CARD_PATH", "") + if cardPath != "" { + data, err := os.ReadFile(cardPath) + if err == nil { + return data + } + log.Printf("Warning: could not read %s: %v, using default card", cardPath, err) + } + + card := map[string]interface{}{ + "name": fmt.Sprintf("%s", agentName), + "description": "TLS test agent for Phase 1 verified fetch testing", + "url": fmt.Sprintf("https://%s.%s.svc.cluster.local:8443", agentName, namespace), + "version": "1.0.0", + "capabilities": map[string]interface{}{ + "streaming": false, + "pushNotifications": false, + }, + "defaultInputModes": []string{"text/plain"}, + "defaultOutputModes": []string{"text/plain"}, + "skills": []map[string]interface{}{ + { + "name": "echo", + "description": "Echoes input back (test skill)", + "inputModes": []string{"text/plain"}, + "outputModes": []string{"text/plain"}, + }, + }, + } + + data, err := json.MarshalIndent(card, "", " ") + if err != nil { + log.Fatalf("Failed to marshal agent card: %v", err) + } + return data +} + +func envOrDefault(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} From 9c5b36ad7fdcbf671ec9f7ac4235c8f8a81da273 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 22:39:29 +0100 Subject: [PATCH 13/16] chore: update Envoy sidecar template for agent-tls port exposure Ensures the injected Envoy sidecar exposes the agent-tls named port so the operator can discover mTLS endpoints by port name. Signed-off-by: Kevin Cogan --- .../internal/webhook/injector/envoy.yaml.tmpl | 43 +++++++++++++++++++ .../webhook/injector/envoy_template.go | 4 ++ 2 files changed, 47 insertions(+) diff --git a/kagenti-operator/internal/webhook/injector/envoy.yaml.tmpl b/kagenti-operator/internal/webhook/injector/envoy.yaml.tmpl index 048e905a..c6f0101b 100644 --- a/kagenti-operator/internal/webhook/injector/envoy.yaml.tmpl +++ b/kagenti-operator/internal/webhook/injector/envoy.yaml.tmpl @@ -102,6 +102,49 @@ static_resources: typed_config: "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router +{{- if .SpireEnabled }} + - name: agent_tls_listener + address: + socket_address: + protocol: TCP + address: 0.0.0.0 + port_value: {{ .TLSPort }} + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: agent_tls + codec_type: AUTO + route_config: + name: agent_tls_routes + virtual_hosts: + - name: local_app_tls + domains: ["*"] + routes: + - match: + prefix: "/" + route: + cluster: original_destination + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + transport_socket: + name: envoy.transport_sockets.tls + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext + common_tls_context: + tls_certificates: + - certificate_chain: + filename: /opt/svid.pem + private_key: + filename: /opt/svid_key.pem + validation_context: + trusted_ca: + filename: /opt/svid_bundle.pem +{{- end }} + clusters: - name: original_destination connect_timeout: 30s diff --git a/kagenti-operator/internal/webhook/injector/envoy_template.go b/kagenti-operator/internal/webhook/injector/envoy_template.go index 451440f5..bbf59ede 100644 --- a/kagenti-operator/internal/webhook/injector/envoy_template.go +++ b/kagenti-operator/internal/webhook/injector/envoy_template.go @@ -34,6 +34,10 @@ type envoyTemplateData struct { OutboundPort int32 InboundPort int32 ExtProcPort int32 + // SpireEnabled gates the TLS listener for verified fetch. + SpireEnabled bool + // TLSPort is the port for the HTTPS listener (used when SpireEnabled is true). + TLSPort int32 } // Default ext-proc gRPC port (go-processor). From eb780c31102a3f7d0ff2d5beb34c443a3fa0ff65 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Thu, 7 May 2026 23:03:37 +0100 Subject: [PATCH 14/16] fix: resolve lint errors in test-tls-agent - Check write error returns in HTTP handlers - Suppress errcheck on deferred Close and Shutdown calls - Remove redundant fmt.Sprintf for string argument Signed-off-by: Kevin Cogan --- kagenti-operator/cmd/test-tls-agent/main.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kagenti-operator/cmd/test-tls-agent/main.go b/kagenti-operator/cmd/test-tls-agent/main.go index 68ea1c64..a98d8946 100644 --- a/kagenti-operator/cmd/test-tls-agent/main.go +++ b/kagenti-operator/cmd/test-tls-agent/main.go @@ -51,11 +51,11 @@ func main() { mux := http.NewServeMux() mux.HandleFunc("/.well-known/agent-card.json", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - w.Write(cardJSON) + _, _ = w.Write(cardJSON) }) mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) - w.Write([]byte("ok")) + _, _ = w.Write([]byte("ok")) }) ctx, cancel := context.WithCancel(context.Background()) @@ -89,7 +89,7 @@ func main() { if err != nil { log.Fatalf("Failed to create X509Source: %v", err) } - defer x509Source.Close() + defer x509Source.Close() //nolint:errcheck svid, err := x509Source.GetX509SVID() if err != nil { @@ -124,8 +124,8 @@ func main() { shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) defer shutdownCancel() - httpServer.Shutdown(shutdownCtx) - tlsServer.Shutdown(shutdownCtx) + _ = httpServer.Shutdown(shutdownCtx) + _ = tlsServer.Shutdown(shutdownCtx) wg.Wait() } @@ -140,7 +140,7 @@ func buildAgentCard(agentName, namespace string) []byte { } card := map[string]interface{}{ - "name": fmt.Sprintf("%s", agentName), + "name": agentName, "description": "TLS test agent for Phase 1 verified fetch testing", "url": fmt.Sprintf("https://%s.%s.svc.cluster.local:8443", agentName, namespace), "version": "1.0.0", From 96ce6ec3ba90534ee29a1dc2059c0090fd40bcc8 Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Wed, 13 May 2026 12:43:56 +0100 Subject: [PATCH 15/16] fix: address reviewer feedback on mTLS verified fetch - Return error from NewSpiffeFetcher on invalid trust domain - Fix wrong error variable returned in legacy endpoint fallback - Prefer VerifiedChains over PeerCertificates for SPIFFE ID extraction - Emit Warning Event when falling back to unverified HTTP fetch - Document mTLS precedence over JWS in evaluateTrust - Fix unused parameter lint warning in test-tls-agent Signed-off-by: Kevin Cogan --- kagenti-operator/cmd/main.go | 7 ++++- kagenti-operator/cmd/test-tls-agent/main.go | 4 +-- .../internal/agentcard/fetcher.go | 30 +++++++++++++------ .../agentcard/fetcher_verified_test.go | 15 ++++++++-- .../controller/agentcard_controller.go | 8 ++++- 5 files changed, 48 insertions(+), 16 deletions(-) diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index a5f186e2..76f248d1 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -361,7 +361,12 @@ func main() { "--spire-trust-domain is required when --enable-verified-fetch=true") os.Exit(1) } - authenticatedFetcher = agentcard.NewSpiffeFetcher(fetchX509Source, td) + fetcher, fetcherErr := agentcard.NewSpiffeFetcher(fetchX509Source, td) + if fetcherErr != nil { + setupLog.Error(fetcherErr, "Failed to create authenticated fetcher") + os.Exit(1) + } + authenticatedFetcher = fetcher defer fetchX509Source.Close() //nolint:errcheck setupLog.Info("Verified fetch enabled (mTLS via SPIFFE)", "socket", verifiedFetchSpiffeSocket, diff --git a/kagenti-operator/cmd/test-tls-agent/main.go b/kagenti-operator/cmd/test-tls-agent/main.go index a98d8946..503d077c 100644 --- a/kagenti-operator/cmd/test-tls-agent/main.go +++ b/kagenti-operator/cmd/test-tls-agent/main.go @@ -49,11 +49,11 @@ func main() { cardJSON := buildAgentCard(agentName, podNamespace) mux := http.NewServeMux() - mux.HandleFunc("/.well-known/agent-card.json", func(w http.ResponseWriter, r *http.Request) { + mux.HandleFunc("/.well-known/agent-card.json", func(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write(cardJSON) }) - mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte("ok")) }) diff --git a/kagenti-operator/internal/agentcard/fetcher.go b/kagenti-operator/internal/agentcard/fetcher.go index 68dd003a..b1fc8290 100644 --- a/kagenti-operator/internal/agentcard/fetcher.go +++ b/kagenti-operator/internal/agentcard/fetcher.go @@ -103,8 +103,7 @@ func (f *DefaultFetcher) fetchA2ACard(ctx context.Context, serviceURL string) (* card, legacyErr := f.fetchAgentCardFromPath(ctx, serviceURL, A2ALegacyAgentCardPath) if legacyErr != nil { - // Return the original error since the primary path is canonical. - return nil, err + return nil, legacyErr } fetcherLogger.Info("Agent card served from deprecated endpoint", @@ -256,8 +255,13 @@ type SpiffeFetcher struct { // NewSpiffeFetcher creates a SpiffeFetcher that uses the provided X509Source // for mTLS and validates peers against the given trust domain. -func NewSpiffeFetcher(source *workloadapi.X509Source, trustDomain string) *SpiffeFetcher { - td, _ := spiffeid.TrustDomainFromString(trustDomain) +func NewSpiffeFetcher( + source *workloadapi.X509Source, trustDomain string, +) (*SpiffeFetcher, error) { + td, err := spiffeid.TrustDomainFromString(trustDomain) + if err != nil { + return nil, fmt.Errorf("invalid SPIFFE trust domain %q: %w", trustDomain, err) + } tlsCfg := tlsconfig.MTLSClientConfig(source, source, tlsconfig.AuthorizeMemberOf(td)) return &SpiffeFetcher{ x509Source: source, @@ -266,7 +270,7 @@ func NewSpiffeFetcher(source *workloadapi.X509Source, trustDomain string) *Spiff Timeout: DefaultFetchTimeout, Transport: &http.Transport{TLSClientConfig: tlsCfg}, }, - } + }, nil } func (f *SpiffeFetcher) FetchAuthenticated(ctx context.Context, protocol, serviceURL string) (*FetchResult, error) { @@ -294,7 +298,7 @@ func (f *SpiffeFetcher) fetchA2ACardAuthenticated(ctx context.Context, serviceUR result, legacyErr := f.fetchAuthenticatedFromPath(ctx, serviceURL, A2ALegacyAgentCardPath) if legacyErr != nil { - return nil, err + return nil, legacyErr } fetcherLogger.Info("Agent card served from deprecated endpoint (authenticated)", @@ -333,12 +337,20 @@ func (f *SpiffeFetcher) fetchAuthenticatedFromPath(ctx context.Context, serviceU }, nil } -// extractSpiffeIDFromTLS returns the SPIFFE ID from the peer certificate's URI SANs. +// extractSpiffeIDFromTLS returns the SPIFFE ID from the verified peer +// certificate's URI SANs. Prefers VerifiedChains (post-validation) over +// PeerCertificates (pre-validation) for defense-in-depth. func extractSpiffeIDFromTLS(state *tls.ConnectionState) string { - if state == nil || len(state.PeerCertificates) == 0 { + if state == nil { return "" } - return spiffeIDFromCert(state.PeerCertificates[0]) + if len(state.VerifiedChains) > 0 && len(state.VerifiedChains[0]) > 0 { + return spiffeIDFromCert(state.VerifiedChains[0][0]) + } + if len(state.PeerCertificates) > 0 { + return spiffeIDFromCert(state.PeerCertificates[0]) + } + return "" } func spiffeIDFromCert(cert *x509.Certificate) string { diff --git a/kagenti-operator/internal/agentcard/fetcher_verified_test.go b/kagenti-operator/internal/agentcard/fetcher_verified_test.go index 7e1cb7a3..3c7d22f4 100644 --- a/kagenti-operator/internal/agentcard/fetcher_verified_test.go +++ b/kagenti-operator/internal/agentcard/fetcher_verified_test.go @@ -66,15 +66,24 @@ func TestSpiffeIDFromCert_MultipleURIs(t *testing.T) { func TestNewSpiffeFetcher(t *testing.T) { g := gomega.NewGomegaWithT(t) - fetcher := NewSpiffeFetcher(nil, "example.org") + fetcher, err := NewSpiffeFetcher(nil, "example.org") + g.Expect(err).NotTo(gomega.HaveOccurred()) g.Expect(fetcher).NotTo(gomega.BeNil()) g.Expect(fetcher.trustDomain).To(gomega.Equal("example.org")) } +func TestNewSpiffeFetcher_InvalidTrustDomain(t *testing.T) { + g := gomega.NewGomegaWithT(t) + _, err := NewSpiffeFetcher(nil, "") + g.Expect(err).To(gomega.HaveOccurred()) + g.Expect(err.Error()).To(gomega.ContainSubstring("invalid SPIFFE trust domain")) +} + func TestSpiffeFetcher_UnsupportedProtocol(t *testing.T) { g := gomega.NewGomegaWithT(t) - fetcher := NewSpiffeFetcher(nil, "example.org") - _, err := fetcher.FetchAuthenticated(t.Context(), "unsupported", "https://example.com") + fetcher, err := NewSpiffeFetcher(nil, "example.org") + g.Expect(err).NotTo(gomega.HaveOccurred()) + _, err = fetcher.FetchAuthenticated(t.Context(), "unsupported", "https://example.com") g.Expect(err).To(gomega.HaveOccurred()) g.Expect(err.Error()).To(gomega.ContainSubstring("unsupported protocol")) } diff --git a/kagenti-operator/internal/controller/agentcard_controller.go b/kagenti-operator/internal/controller/agentcard_controller.go index bcb10d4d..da924d26 100644 --- a/kagenti-operator/internal/controller/agentcard_controller.go +++ b/kagenti-operator/internal/controller/agentcard_controller.go @@ -349,6 +349,9 @@ func (r *AgentCardReconciler) fetchCardData( } agentCardLogger.Info("TLS port not found on service, falling back to HTTP fetch", "service", workload.ServiceName, "expectedPortName", AgentTLSPortName) + r.Recorder.Event(agentCard, corev1.EventTypeWarning, "FallbackToHTTP", + fmt.Sprintf("Service %s has no %s port; fetch is unverified", + workload.ServiceName, AgentTLSPortName)) cardData, err := r.AgentFetcher.Fetch( ctx, protocol, serviceURL, workload.ServiceName, agentCard.Namespace) if err != nil { @@ -438,7 +441,10 @@ func (r *AgentCardReconciler) evaluateTrust( //nolint:gocyclo eval.identityMatch = &match } - // Compute final Verified status + // Compute final Verified status. + // mTLS takes precedence over JWS: if the agent was authenticated via mTLS, + // it is unconditionally Verified regardless of JWS state. This avoids + // ambiguity when both mechanisms succeed with different SPIFFE IDs. if eval.isMTLSVerified { eval.verifiedStatus = metav1.ConditionTrue eval.verifiedReason = "mTLSVerified" From c2e4866bd3c354e0a46e26b922144c9b6d8535ff Mon Sep 17 00:00:00 2001 From: Kevin Cogan Date: Mon, 18 May 2026 13:59:15 +0100 Subject: [PATCH 16/16] fix: guard nil Recorder and return error from cleanupVerifiedFetchFields - Add nil check on r.Recorder.Event() in FallbackToHTTP path - Change cleanupVerifiedFetchFields to return error to caller - Caller logs and continues without failing the reconcile loop Signed-off-by: Kevin Cogan --- .../controller/agentcard_controller.go | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/kagenti-operator/internal/controller/agentcard_controller.go b/kagenti-operator/internal/controller/agentcard_controller.go index da924d26..892304c3 100644 --- a/kagenti-operator/internal/controller/agentcard_controller.go +++ b/kagenti-operator/internal/controller/agentcard_controller.go @@ -349,9 +349,11 @@ func (r *AgentCardReconciler) fetchCardData( } agentCardLogger.Info("TLS port not found on service, falling back to HTTP fetch", "service", workload.ServiceName, "expectedPortName", AgentTLSPortName) - r.Recorder.Event(agentCard, corev1.EventTypeWarning, "FallbackToHTTP", - fmt.Sprintf("Service %s has no %s port; fetch is unverified", - workload.ServiceName, AgentTLSPortName)) + if r.Recorder != nil { + r.Recorder.Event(agentCard, corev1.EventTypeWarning, "FallbackToHTTP", + fmt.Sprintf("Service %s has no %s port; fetch is unverified", + workload.ServiceName, AgentTLSPortName)) + } cardData, err := r.AgentFetcher.Fetch( ctx, protocol, serviceURL, workload.ServiceName, agentCard.Namespace) if err != nil { @@ -377,7 +379,9 @@ func (r *AgentCardReconciler) fetchCardData( } return nil, nil, err } - r.cleanupVerifiedFetchFields(ctx, agentCard) + if err := r.cleanupVerifiedFetchFields(ctx, agentCard); err != nil { + agentCardLogger.Error(err, "Failed to cleanup verified fetch fields", "agentCard", agentCard.Name) + } return cardData, nil, nil } @@ -1057,23 +1061,22 @@ func (r *AgentCardReconciler) propagateVerifiedLabel( } // cleanupVerifiedFetchFields removes stale Phase 1 fields when verifiedFetch is disabled. -func (r *AgentCardReconciler) cleanupVerifiedFetchFields(ctx context.Context, agentCard *agentv1alpha1.AgentCard) { +func (r *AgentCardReconciler) cleanupVerifiedFetchFields(ctx context.Context, agentCard *agentv1alpha1.AgentCard) error { latest := &agentv1alpha1.AgentCard{} if err := r.Get(ctx, types.NamespacedName{ Name: agentCard.Name, Namespace: agentCard.Namespace, }, latest); err != nil { - agentCardLogger.Error(err, "Failed to get AgentCard for cleanup", "agentCard", agentCard.Name) - return + return err } verifiedCond := meta.FindStatusCondition(latest.Status.Conditions, ConditionVerified) needsUpdate := verifiedCond != nil || latest.Status.AttestedAgentSpiffeID != "" if !needsUpdate { - return + return nil } - if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + return retry.RetryOnConflict(retry.DefaultRetry, func() error { if err := r.Get(ctx, types.NamespacedName{ Name: agentCard.Name, Namespace: agentCard.Namespace, @@ -1083,9 +1086,7 @@ func (r *AgentCardReconciler) cleanupVerifiedFetchFields(ctx context.Context, ag meta.RemoveStatusCondition(&latest.Status.Conditions, ConditionVerified) latest.Status.AttestedAgentSpiffeID = "" return r.Status().Update(ctx, latest) - }); err != nil { - agentCardLogger.Error(err, "Failed to cleanup verified fetch fields", "agentCard", agentCard.Name) - } + }) } func (r *AgentCardReconciler) handleDeletion(ctx context.Context, agentCard *agentv1alpha1.AgentCard) (ctrl.Result, error) {