From 8c335e9f756905b0de6cbecb1fbe42040234afb6 Mon Sep 17 00:00:00 2001 From: Andrey Kolkov Date: Thu, 4 Jun 2026 12:54:39 +0400 Subject: [PATCH] feat(main): add e2e with kamaji case --- .github/workflows/ci.yml | 5 +- .github/workflows/e2e.yml | 48 +++ .github/workflows/publish.yml | 4 +- Makefile | 8 + cmd/kubectl-etcd/helpers_test.go | 252 ++++++++++++++ config/default/manager_auth_proxy_patch.yaml | 5 +- controllers/etcdcluster_controller.go | 13 +- controllers/etcdcluster_controller_test.go | 63 ++++ hack/e2e.sh | 117 +++++++ test/e2e/kamaji_datastore_test.go | 346 +++++++++++++++++++ test/e2e/suite_test.go | 213 ++++++++++++ test/e2e/testdata/00-namespace.yaml | 4 + test/e2e/testdata/01-pki.yaml | 59 ++++ test/e2e/testdata/02-etcdcluster.yaml | 24 ++ test/e2e/testdata/03-datastore.yaml | 35 ++ test/e2e/testdata/04-tenantcontrolplane.yaml | 22 ++ 16 files changed, 1213 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/e2e.yml create mode 100644 cmd/kubectl-etcd/helpers_test.go create mode 100755 hack/e2e.sh create mode 100644 test/e2e/kamaji_datastore_test.go create mode 100644 test/e2e/suite_test.go create mode 100644 test/e2e/testdata/00-namespace.yaml create mode 100644 test/e2e/testdata/01-pki.yaml create mode 100644 test/e2e/testdata/02-etcdcluster.yaml create mode 100644 test/e2e/testdata/03-datastore.yaml create mode 100644 test/e2e/testdata/04-tenantcontrolplane.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b50150e..b097c3e7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,11 @@ name: CI on: + # No PR branch filter: PRs here target main and integration branches (e.g. + # v1) alike; the old `master` filter matched nothing, so CI never ran on PRs. pull_request: - branches: [ master ] push: - branches: [ master ] + branches: [ main ] jobs: verify: diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..51c25b52 --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,48 @@ +name: E2E + +on: + # No PR branch filter: PRs in this repo target main as well as integration + # branches (e.g. v1), and the suite must gate all of them. A filter naming + # a branch PRs never target (this said `master`, which does not exist) means + # the workflow never runs at all. + # + # paths-ignore: a kind+cert-manager+Kamaji provisioning run costs ~30-45 + # minutes; skip it for PRs that touch nothing the suite exercises + # (docs-only changes). Everything else — Go code, manifests, the harness, + # the workflows themselves — still gates. + pull_request: + paths-ignore: + - '**.md' + - 'docs/**' + push: + tags: [ "v*" ] + workflow_dispatch: + +concurrency: + group: e2e-${{ github.ref }} + cancel-in-progress: true + +jobs: + kamaji-datastore: + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: e2e (kind + cert-manager + Kamaji) + # Provisions an ephemeral kind cluster, installs cert-manager and + # Kamaji at the versions pinned in hack/e2e.sh, builds the operator + # from this checkout and runs test/e2e — the Kamaji DataStore + # consumer scenario. + # + # On failure, hack/e2e.sh's EXIT trap dumps the cluster state + # (CRs, pods, operator and Kamaji logs) into this step's output + # BEFORE tearing the kind cluster down. Don't add a separate + # dump step here: by the time it would run, the trap has already + # deleted the cluster and every kubectl call would come up empty. + run: make e2e diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 54e0bc6e..01addceb 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,7 +2,9 @@ name: Build and Publish Docker Image on: push: - branches: [ master ] + # The default branch is main; the old `master` filter matched nothing, + # so images were never published. + branches: [ main ] jobs: build-and-publish: diff --git a/Makefile b/Makefile index 49e76dbf..c6a45a40 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,14 @@ vet: ## Run go vet against code. test: manifests generate fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./... -coverprofile cover.out +.PHONY: test-e2e +test-e2e: ## Run the e2e suite against the current kubeconfig context (expects cert-manager, Kamaji and the operator installed; see hack/e2e.sh). + go test -tags e2e -count=1 ./test/e2e/ -v -timeout 30m + +.PHONY: e2e +e2e: ## Provision a kind cluster with cert-manager and Kamaji, deploy the operator, run the e2e suite. KEEP_CLUSTER=1 keeps the cluster for debugging. + hack/e2e.sh + ##@ Build .PHONY: build diff --git a/cmd/kubectl-etcd/helpers_test.go b/cmd/kubectl-etcd/helpers_test.go new file mode 100644 index 00000000..60739b01 --- /dev/null +++ b/cmd/kubectl-etcd/helpers_test.go @@ -0,0 +1,252 @@ +package main + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "errors" + "fmt" + "math/big" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// ── Blocker #2: a failed/never-ready port-forward must not hang ────────────── + +func TestAwaitForward_Ready(t *testing.T) { + ready := make(chan struct{}, 1) + close(ready) + if err := awaitForward(ready, make(chan error, 1), make(chan struct{}, 1), time.Second); err != nil { + t.Fatalf("ready forward should succeed, got %v", err) + } +} + +func TestAwaitForward_ErrorBeforeReady(t *testing.T) { + // ForwardPorts returns an error without ever closing readyChan — the old + // code blocked on <-readyChan forever. awaitForward must return the error. + forwardErr := make(chan error, 1) + forwardErr <- fmt.Errorf("dial tcp: connection refused") + err := awaitForward(make(chan struct{}), forwardErr, make(chan struct{}, 1), time.Second) + if err == nil { + t.Fatal("a forward failure must return an error, not hang or succeed") + } + if !strings.Contains(err.Error(), "connection refused") { + t.Errorf("error should wrap the forward failure, got %v", err) + } +} + +func TestAwaitForward_Timeout(t *testing.T) { + stop := make(chan struct{}, 1) + // Nothing ever signals ready and no error arrives → must time out, not hang. + err := awaitForward(make(chan struct{}), make(chan error, 1), stop, 10*time.Millisecond) + if err == nil { + t.Fatal("a never-ready forward must time out, not hang") + } + select { + case <-stop: + default: + t.Error("timeout must close stopChan to tear the forwarder down") + } +} + +// ── Blocker #3: TLS discovery and credential loading ──────────────────────── + +// etcdTLSPod returns a Pod whose etcd container points --trusted-ca-file at a +// secret-backed volume mount, mirroring what the operator builds. +func etcdTLSPod() (*corev1.Pod, corev1.Container) { + c := corev1.Container{ + Name: "etcd", + Command: []string{"etcd"}, + Args: []string{"--trusted-ca-file=/etc/etcd/pki/ca/ca.crt"}, + VolumeMounts: []corev1.VolumeMount{{Name: "ca", MountPath: "/etc/etcd/pki/ca"}}, + } + pod := &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{c}, + Volumes: []corev1.Volume{{ + Name: "ca", + VolumeSource: corev1.VolumeSource{Secret: &corev1.SecretVolumeSource{SecretName: "etcd-ca"}}, + }}, + }, + } + return pod, c +} + +func TestFindSecretNameForTLS_Happy(t *testing.T) { + pod, c := etcdTLSPod() + name, err := findSecretNameForTLS(pod, c) + if err != nil { + t.Fatal(err) + } + if name != "etcd-ca" { + t.Errorf("secret name = %q, want %q", name, "etcd-ca") + } +} + +func TestFindSecretNameForTLS_NoCAFlag(t *testing.T) { + c := corev1.Container{Name: "etcd", Args: []string{"--listen-client-urls=http://0.0.0.0:2379"}} + pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{c}}} + _, err := findSecretNameForTLS(pod, c) + if !errors.Is(err, errNoTrustedCAFile) { + t.Errorf("expected errNoTrustedCAFile sentinel, got %v", err) + } +} + +func TestFindSecretNameForTLS_MountNotFound(t *testing.T) { + c := corev1.Container{Name: "etcd", Args: []string{"--trusted-ca-file=/nowhere/ca.crt"}} + pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{c}}} + if _, err := findSecretNameForTLS(pod, c); err == nil { + t.Error("expected an error when no volume backs the CA path") + } +} + +func TestLoadCredentials_Happy(t *testing.T) { + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"}, + Data: map[string][]byte{ + corev1.BasicAuthUsernameKey: []byte("root"), + corev1.BasicAuthPasswordKey: []byte("s3cret"), + }, + }) + u, p, err := loadCredentials(cs, "ns1", "creds") + if err != nil { + t.Fatal(err) + } + if u != "root" || p != "s3cret" { + t.Errorf("got %q/%q, want root/s3cret", u, p) + } +} + +func TestLoadCredentials_DefaultsUsernameToRoot(t *testing.T) { + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"}, + Data: map[string][]byte{corev1.BasicAuthPasswordKey: []byte("p")}, + }) + u, _, err := loadCredentials(cs, "ns1", "creds") + if err != nil { + t.Fatal(err) + } + if u != "root" { + t.Errorf("username default = %q, want root", u) + } +} + +func TestLoadCredentials_MissingPassword(t *testing.T) { + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"}, + Data: map[string][]byte{corev1.BasicAuthUsernameKey: []byte("root")}, + }) + if _, _, err := loadCredentials(cs, "ns1", "creds"); err == nil { + t.Error("expected an error when the password key is missing/empty") + } +} + +func TestLoadCredentials_NamespacedRef(t *testing.T) { + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "other"}, + Data: map[string][]byte{corev1.BasicAuthPasswordKey: []byte("p")}, + }) + // Default namespace is ns1, but the "other/creds" ref must override it. + _, p, err := loadCredentials(cs, "ns1", "other/creds") + if err != nil || p != "p" { + t.Errorf("namespace/name ref not honored: p=%q err=%v", p, err) + } +} + +func TestExtractTLSFiles_Happy(t *testing.T) { + cert, key := selfSignedPEM(t) + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "etcd-ca", Namespace: "ns1"}, + Data: map[string][]byte{"ca.crt": cert, "tls.crt": cert, "tls.key": key}, + }) + pool, clientCert, err := extractTLSFiles(cs, "ns1", "etcd-ca") + if err != nil { + t.Fatal(err) + } + if pool == nil || clientCert == nil { + t.Fatal("expected a non-nil CA pool and client certificate") + } +} + +func TestExtractTLSFiles_MissingCA(t *testing.T) { + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "etcd-ca", Namespace: "ns1"}, + Data: map[string][]byte{"tls.crt": []byte("x"), "tls.key": []byte("y")}, + }) + if _, _, err := extractTLSFiles(cs, "ns1", "etcd-ca"); err == nil { + t.Error("expected an error when ca.crt is absent from the secret") + } +} + +func TestGetTLSConfig_Plaintext(t *testing.T) { + c := corev1.Container{Name: "etcd", Args: []string{"--listen-client-urls=http://0.0.0.0:2379"}} + pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{c}}} + cfg, err := getTLSConfig(fake.NewSimpleClientset(), pod, "ns1") + if err != nil { + t.Fatal(err) + } + if cfg != nil { + t.Errorf("a plaintext cluster should yield a nil TLS config, got %+v", cfg) + } +} + +func TestGetTLSConfig_TLS(t *testing.T) { + cert, key := selfSignedPEM(t) + pod, _ := etcdTLSPod() + cs := fake.NewSimpleClientset(&corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "etcd-ca", Namespace: "ns1"}, + Data: map[string][]byte{"ca.crt": cert, "tls.crt": cert, "tls.key": key}, + }) + cfg, err := getTLSConfig(cs, pod, "ns1") + if err != nil { + t.Fatal(err) + } + if cfg == nil { + t.Fatal("expected a TLS config for a TLS-enabled cluster") + } + if cfg.MinVersion != tls.VersionTLS12 { + t.Errorf("MinVersion = %d, want TLS 1.2 (%d)", cfg.MinVersion, tls.VersionTLS12) + } + if cfg.RootCAs == nil || len(cfg.Certificates) != 1 { + t.Error("expected a populated RootCAs pool and exactly one client certificate") + } +} + +// selfSignedPEM returns a fresh self-signed certificate and its EC private key, +// PEM-encoded — enough to satisfy x509 parsing and tls.X509KeyPair in tests. +func selfSignedPEM(t *testing.T) (certPEM, keyPEM []byte) { + t.Helper() + priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("generate key: %v", err) + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "test-etcd"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageCertSign, + BasicConstraintsValid: true, + IsCA: true, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &priv.PublicKey, priv) + if err != nil { + t.Fatalf("create certificate: %v", err) + } + certPEM = pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + keyDER, err := x509.MarshalECPrivateKey(priv) + if err != nil { + t.Fatalf("marshal key: %v", err) + } + keyPEM = pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}) + return certPEM, keyPEM +} diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml index b7512661..c5cb56ab 100644 --- a/config/default/manager_auth_proxy_patch.yaml +++ b/config/default/manager_auth_proxy_patch.yaml @@ -31,7 +31,10 @@ spec: capabilities: drop: - "ALL" - image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1 + # gcr.io/kubebuilder/kube-rbac-proxy is gone (the gcr.io/kubebuilder + # registry shut down in early 2025); quay.io/brancz is the upstream + # home of kube-rbac-proxy that kubebuilder pointed users to. + image: quay.io/brancz/kube-rbac-proxy:v0.18.2 args: - "--secure-listen-address=0.0.0.0:8443" - "--upstream=http://127.0.0.1:8080/" diff --git a/controllers/etcdcluster_controller.go b/controllers/etcdcluster_controller.go index 32a9d33d..18090d1d 100644 --- a/controllers/etcdcluster_controller.go +++ b/controllers/etcdcluster_controller.go @@ -452,8 +452,19 @@ func (r *EtcdClusterReconciler) bootstrap( // Pod label on the first reconcile of the seed Pod, so the PDB // selects it from creation rather than only after MemberList runs // a few reconciles later. + // + // Written as a merge patch, not an Update: the member controller + // reconciles the seed the moment it is created and its own + // Status().Update races with this write — an optimistic-locked + // Update here loses that race with a conflict, and a cache-backed + // re-Get cannot recover (the informer may not even have the + // object yet). This stamp happens only in the create branch — + // losing it would leave the seed permanently IsVoter=false, and + // both learner memberID discovery and promotion filter their dial + // endpoints to voters, wedging every later scale-up. + stampBase := seed.DeepCopy() seed.Status.IsVoter = true - if err := r.Status().Update(ctx, seed); err != nil { + if err := r.Status().Patch(ctx, seed, client.MergeFrom(stampBase)); err != nil { return ctrl.Result{}, err } } else if !metav1.IsControlledBy(seed, cluster) { diff --git a/controllers/etcdcluster_controller_test.go b/controllers/etcdcluster_controller_test.go index a767e285..4f28f18b 100644 --- a/controllers/etcdcluster_controller_test.go +++ b/controllers/etcdcluster_controller_test.go @@ -34,6 +34,8 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" lll "github.com/cozystack/etcd-operator/api/v1alpha2" ) @@ -110,6 +112,67 @@ func TestBootstrap_CreatesSingleSeedMember(t *testing.T) { if !seed.Spec.Bootstrap { t.Fatalf("seed member must be marked Bootstrap=true") } + // The seed is pre-stamped IsVoter=true at create time (it forms the + // cluster on its own — never a learner). Losing the stamp wedges every + // later scale-up: learner memberID discovery and promotion dial voters + // only. + if !seed.Status.IsVoter { + t.Fatalf("seed Status.IsVoter must be pre-stamped true at create; got false") + } +} + +// TestBootstrap_SeedIsVoterStampSurvivesConcurrentStatusWriter pins the +// merge-patch choice in the seed IsVoter pre-stamp +// (etcdcluster_controller.go bootstrap). The member controller starts +// reconciling the seed the moment Create returns, and its own +// Status().Update bumps the resourceVersion concurrently with this stamp. +// An optimistic-locked Status().Update here loses that race with a +// Conflict, and — because the stamp lives only in the create branch — the +// seed would stay IsVoter=false forever. The interceptor models the +// concurrent writer by failing every optimistic-locked EtcdMember status +// Update with a Conflict; the merge patch carries no resourceVersion and +// must land regardless. This test fails against the previous +// Status().Update implementation. +func TestBootstrap_SeedIsVoterStampSurvivesConcurrentStatusWriter(t *testing.T) { + ctx := context.Background() + cluster := &lll.EtcdCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "ns"}, + Spec: lll.EtcdClusterSpec{ + Replicas: ptrInt32(3), + Version: "3.5.17", + Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")}, + }, + } + s := testScheme(t) + c := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(cluster). + WithStatusSubresource(&lll.EtcdCluster{}, &lll.EtcdMember{}, &lll.EtcdSnapshot{}). + WithInterceptorFuncs(interceptor.Funcs{ + SubResourceUpdate: func(ctx context.Context, cl client.Client, sub string, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if _, isMember := obj.(*lll.EtcdMember); isMember && sub == "status" { + return apierrors.NewConflict( + schema.GroupResource{Group: lll.GroupVersion.Group, Resource: "etcdmembers"}, + obj.GetName(), errors.New("simulated concurrent status writer")) + } + return cl.SubResource(sub).Update(ctx, obj, opts...) + }, + }). + Build() + r := &EtcdClusterReconciler{Client: c, Scheme: s, EtcdClientFactory: factoryReturning(newFakeEtcd(0xdeadbeef))} + + reconcileUntilStable(t, r, c, "test", "ns", 8) + + members := &lll.EtcdMemberList{} + if err := c.List(ctx, members, client.InNamespace("ns")); err != nil { + t.Fatalf("List: %v", err) + } + if len(members.Items) != 1 { + t.Fatalf("expected one seed member; got %d", len(members.Items)) + } + if !members.Items[0].Status.IsVoter { + t.Fatal("seed IsVoter stamp lost under a concurrent status writer; the stamp must be a merge patch, not an optimistic-locked Update") + } } // TestObservedSpec_LocksMidFlight pins the locking pattern's correctness diff --git a/hack/e2e.sh b/hack/e2e.sh new file mode 100755 index 00000000..3ecca865 --- /dev/null +++ b/hack/e2e.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# End-to-end harness: provisions a kind cluster with cert-manager and Kamaji, +# builds and deploys the operator from the working tree, then runs the e2e +# suite (test/e2e). Destroys the kind cluster on exit unless KEEP_CLUSTER=1. +# +# Requirements on the host: docker, kubectl, helm, go. kind is installed +# into ./bin automatically when missing. +set -euo pipefail + +# Always build/pull for the host architecture. A user-level +# DOCKER_DEFAULT_PLATFORM=linux/amd64 (common on Apple Silicon for x86-only +# tooling) would otherwise pull an emulated kind node whose control plane +# never becomes healthy. +unset DOCKER_DEFAULT_PLATFORM + +# ── Pinned component versions ──────────────────────────────────────────── +# v0.32.0+ required with Docker 29 ("failed to detect containerd +# snapshotter" on `kind load` with older releases). +KIND_VERSION=v0.32.0 +KIND_NODE_IMAGE=kindest/node:v1.34.0 +CERT_MANAGER_VERSION=v1.18.2 +KAMAJI_CHART_VERSION=1.0.0 +# The TenantControlPlane Kubernetes version lives in +# test/e2e/testdata/04-tenantcontrolplane.yaml and must stay within the skew +# supported by KAMAJI_CHART_VERSION. + +KIND_CLUSTER_NAME=${KIND_CLUSTER_NAME:-etcd-operator-e2e} +IMG=${IMG:-etcd-operator:e2e} +KEEP_CLUSTER=${KEEP_CLUSTER:-} + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT" +LOCALBIN="$ROOT/bin" +mkdir -p "$LOCALBIN" +export PATH="$LOCALBIN:$PATH" + +if ! command -v kind >/dev/null 2>&1; then + echo "--- installing kind $KIND_VERSION into $LOCALBIN" + GOBIN="$LOCALBIN" go install sigs.k8s.io/kind@"$KIND_VERSION" +fi + +# dump_diagnostics prints cluster state for post-mortem debugging. It MUST run +# from the EXIT trap, before the kind cluster is deleted — a separate CI step +# after this script cannot do it, because by the time the step runs the trap +# has already torn the cluster down and every kubectl call would fail. +dump_diagnostics() { + echo "--- e2e failed; dumping cluster state before teardown" + kubectl get etcdclusters,etcdmembers,pods,certificates,secrets -A || true + kubectl get datastores,tenantcontrolplanes -A || true + kubectl -n etcd-operator-system logs deploy/etcd-operator-controller-manager --tail=200 || true + kubectl -n kamaji-system logs deploy/kamaji --tail=200 || true + # The tenant namespace is where the longest wait (TenantControlPlane + # Ready) fails — dump every pod's logs there, or the one failure mode + # the suite is most likely to hit leaves no trace. + for p in $(kubectl -n kamaji-e2e get pods -o name 2>/dev/null); do + echo "--- logs: kamaji-e2e/$p" + kubectl -n kamaji-e2e logs "$p" --all-containers --tail=100 || true + done +} + +cleanup() { + status=$? + if [ "$status" -ne 0 ]; then + dump_diagnostics + fi + if [ -n "$KEEP_CLUSTER" ]; then + echo "--- KEEP_CLUSTER set; kind cluster '$KIND_CLUSTER_NAME' left running" + return + fi + echo "--- deleting kind cluster '$KIND_CLUSTER_NAME'" + kind delete cluster --name "$KIND_CLUSTER_NAME" >/dev/null 2>&1 || true +} +trap cleanup EXIT + +echo "--- creating kind cluster '$KIND_CLUSTER_NAME' ($KIND_NODE_IMAGE)" +kind create cluster --name "$KIND_CLUSTER_NAME" --image "$KIND_NODE_IMAGE" --wait 5m +kubectl config use-context "kind-$KIND_CLUSTER_NAME" + +echo "--- installing cert-manager $CERT_MANAGER_VERSION" +kubectl apply -f "https://github.com/cert-manager/cert-manager/releases/download/${CERT_MANAGER_VERSION}/cert-manager.yaml" +kubectl -n cert-manager wait deploy --all --for=condition=Available --timeout=5m + +echo "--- installing Kamaji (chart $KAMAJI_CHART_VERSION)" +# etcd.deploy=false / datastore.enabled=false: Kamaji's bundled kamaji-etcd +# stays out — the DataStore under test is the operator-managed cluster the +# suite creates (test/e2e/testdata/03-datastore.yaml), whose name the +# manager receives as its default via datastore.nameOverride. +helm repo add clastix https://clastix.github.io/charts --force-update >/dev/null +# The kamaji manager exits at startup while its default DataStore is +# missing, and the DataStore cannot be created later because the chart's +# validating webhook (failurePolicy=Fail) is served by that same crashing +# pod. Break the cycle the way the chart itself does for its bundled +# datastore (a pre-install hook): install the CRDs first and create the +# DataStore before the webhook configuration exists. The suite re-applies +# the same fixture later, which is a no-op. +helm show crds clastix/kamaji --version "$KAMAJI_CHART_VERSION" | kubectl apply -f - +kubectl apply -f test/e2e/testdata/03-datastore.yaml +helm upgrade --install kamaji clastix/kamaji \ + --version "$KAMAJI_CHART_VERSION" \ + --namespace kamaji-system --create-namespace \ + --set etcd.deploy=false \ + --set datastore.enabled=false \ + --set datastore.nameOverride=kamaji-e2e \ + --wait --timeout 5m +# NOTE: the literal "kamaji-e2e" above must stay in sync with its two other +# copies: e2eNamespace in test/e2e/kamaji_datastore_test.go and the +# metadata.namespace/DataStore name in test/e2e/testdata/*.yaml. + +echo "--- building and deploying the operator ($IMG)" +docker build -t "$IMG" . +kind load docker-image "$IMG" --name "$KIND_CLUSTER_NAME" +make install deploy IMG="$IMG" +kubectl -n etcd-operator-system wait deploy/etcd-operator-controller-manager \ + --for=condition=Available --timeout=5m + +echo "--- running e2e suite" +make test-e2e diff --git a/test/e2e/kamaji_datastore_test.go b/test/e2e/kamaji_datastore_test.go new file mode 100644 index 00000000..81a0d5dd --- /dev/null +++ b/test/e2e/kamaji_datastore_test.go @@ -0,0 +1,346 @@ +//go:build e2e + +package e2e + +import ( + "bytes" + "context" + "fmt" + "net" + "net/http" + "net/url" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/portforward" + "k8s.io/client-go/tools/remotecommand" + "k8s.io/client-go/transport/spdy" + "sigs.k8s.io/controller-runtime/pkg/client" + + etcdv1alpha2 "github.com/cozystack/etcd-operator/api/v1alpha2" +) + +const ( + // e2eNamespace must stay in sync with two other copies of the literal: + // the fixtures (test/e2e/testdata/*.yaml, metadata.namespace) and the + // harness (hack/e2e.sh: `datastore.nameOverride=kamaji-e2e` and the + // diagnostics dump). Changing one without the others breaks the suite + // in confusing ways (Kamaji's manager looks up the DataStore by name). + e2eNamespace = "kamaji-e2e" + clusterName = "etcd" + tcpName = "tenant1" + proofName = "e2e-proof" // ConfigMap created via the tenant API, then grepped out of etcd +) + +// TestKamajiDataStore proves the documented consumer story end to end: an +// operator-managed, full-mTLS EtcdCluster serves as the backing store of a +// Kamaji DataStore, a TenantControlPlane comes up on it, its API answers, +// and objects written through the tenant API land as keys in our etcd. +func TestKamajiDataStore(t *testing.T) { + ctx := context.Background() + fixtures := fixturePaths(t) + + // 00-namespace, 01-pki — namespace and the cert-manager PKI. + applyFixture(ctx, t, fixtures[0]) + applyFixture(ctx, t, fixtures[1]) + waitFor(ctx, t, 2*time.Minute, "PKI secrets issued", func(ctx context.Context) error { + if err := secretExists(e2eNamespace, "etcd-ca-tls", "tls.crt", "tls.key")(ctx); err != nil { + return err + } + return secretExists(e2eNamespace, "kamaji-etcd-client-tls", "tls.crt", "tls.key")(ctx) + }) + + // 02-etcdcluster — the cluster under test. + applyFixture(ctx, t, fixtures[2]) + waitFor(ctx, t, 5*time.Minute, "EtcdCluster Available", etcdClusterAvailable(e2eNamespace, clusterName)) + waitFor(ctx, t, 1*time.Minute, "operator-issued TLS secrets", func(ctx context.Context) error { + for _, name := range []string{ + clusterName + "-server-tls", + clusterName + "-operator-client-tls", + clusterName + "-peer-tls", + } { + if err := secretExists(e2eNamespace, name, "tls.crt", "tls.key")(ctx); err != nil { + return err + } + } + return nil + }) + for _, svc := range []string{clusterName, clusterName + "-client"} { + if err := kube.Get(ctx, client.ObjectKey{Namespace: e2eNamespace, Name: svc}, &corev1.Service{}); err != nil { + t.Fatalf("expected Service %s/%s: %v", e2eNamespace, svc, err) + } + } + + // 03-datastore, 04-tenantcontrolplane — the Kamaji side. + applyFixture(ctx, t, fixtures[3]) + applyFixture(ctx, t, fixtures[4]) + waitFor(ctx, t, 10*time.Minute, "TenantControlPlane Ready", func(ctx context.Context) error { + status, err := unstructuredField(ctx, "kamaji.clastix.io/v1alpha1", "TenantControlPlane", + e2eNamespace, tcpName, "status", "kubernetesResources", "version", "status") + if err != nil { + return err + } + if status != "Ready" { + return fmt.Errorf("status.kubernetesResources.version.status=%q", status) + } + return nil + }) + + // Reach the tenant API server through a port-forward and prove it works. + tenant, stop := tenantClient(ctx, t) + defer stop() + + rt, err := rest.TransportFor(tenant) + if err != nil { + t.Fatalf("build tenant transport: %v", err) + } + code, body := httpGet(t, rt, tenant.Host+"/healthz") + if code != http.StatusOK || body != "ok" { + t.Fatalf("tenant /healthz: code=%d body=%q", code, body) + } + t.Log("tenant /healthz ok") + + tenantSet, err := kubernetes.NewForConfig(tenant) + if err != nil { + t.Fatalf("build tenant clientset: %v", err) + } + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: proofName, Namespace: "default"}, + Data: map[string]string{"written-by": "etcd-operator-e2e"}, + } + if _, err := tenantSet.CoreV1().ConfigMaps("default").Create(ctx, cm, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + t.Fatalf("create ConfigMap via tenant API: %v", err) + } + got, err := tenantSet.CoreV1().ConfigMaps("default").Get(ctx, proofName, metav1.GetOptions{}) + if err != nil || got.Data["written-by"] != "etcd-operator-e2e" { + t.Fatalf("read ConfigMap back via tenant API: %v (data=%v)", err, got.Data) + } + t.Log("tenant API ConfigMap roundtrip ok") + + // The ConfigMap must exist as a key in *our* etcd — that is the whole + // point of the DataStore wiring. + keys := etcdKeys(ctx, t) + if !strings.Contains(keys, proofName) { + t.Fatalf("etcd key dump does not contain %q;\nfirst 2000 bytes:\n%s", proofName, truncate(keys, 2000)) + } + t.Logf("found %q among etcd keys", proofName) + + // Teardown — reverse order, waiting where deletion is asynchronous. + deleteAndWait(ctx, t, "kamaji.clastix.io/v1alpha1", "TenantControlPlane", e2eNamespace, tcpName, 5*time.Minute) + deleteAndWait(ctx, t, "kamaji.clastix.io/v1alpha1", "DataStore", "", "kamaji-e2e", 2*time.Minute) + ec := &etcdv1alpha2.EtcdCluster{ObjectMeta: metav1.ObjectMeta{Namespace: e2eNamespace, Name: clusterName}} + if err := kube.Delete(ctx, ec); err != nil && !apierrors.IsNotFound(err) { + t.Fatalf("delete EtcdCluster: %v", err) + } + waitFor(ctx, t, 5*time.Minute, "etcd pods gone", func(ctx context.Context) error { + pods := &corev1.PodList{} + if err := kube.List(ctx, pods, client.InNamespace(e2eNamespace), + client.MatchingLabels{"etcd-operator.cozystack.io/cluster": clusterName}); err != nil { + return err + } + if n := len(pods.Items); n > 0 { + return fmt.Errorf("%d etcd pods still present", n) + } + return nil + }) +} + +// tenantClient builds a rest.Config for the tenant API server: admin +// kubeconfig from the Kamaji-generated Secret, transport rerouted through a +// port-forward to the apiserver pod (the kubeconfig points at a ClusterIP +// that is unreachable from outside the management cluster). +func tenantClient(ctx context.Context, t *testing.T) (*rest.Config, func()) { + t.Helper() + + secretName, err := unstructuredField(ctx, "kamaji.clastix.io/v1alpha1", "TenantControlPlane", + e2eNamespace, tcpName, "status", "kubeconfig", "admin", "secretName") + if err != nil { + t.Fatalf("read admin kubeconfig secret name: %v", err) + } + sec := &corev1.Secret{} + if err := kube.Get(ctx, client.ObjectKey{Namespace: e2eNamespace, Name: secretName}, sec); err != nil { + t.Fatalf("get kubeconfig secret %s: %v", secretName, err) + } + var kubeconfig []byte + for _, key := range []string{"super-admin.conf", "admin.conf", "super-admin.svc", "admin.svc"} { + if len(sec.Data[key]) > 0 { + kubeconfig = sec.Data[key] + break + } + } + if kubeconfig == nil { + t.Fatalf("no kubeconfig key in secret %s (keys: %v)", secretName, mapKeys(sec.Data)) + } + tenant, err := clientcmd.RESTConfigFromKubeConfig(kubeconfig) + if err != nil { + t.Fatalf("parse tenant kubeconfig: %v", err) + } + + localPort, stop := portForwardAPIServer(ctx, t) + + // Keep certificate verification honest: ServerName stays the original + // endpoint host (present in the apiserver cert SANs), only the dial + // target moves to the forwarded port. + host, _, err := net.SplitHostPort(strings.TrimPrefix(tenant.Host, "https://")) + if err != nil { + t.Fatalf("parse tenant host %q: %v", tenant.Host, err) + } + tenant.TLSClientConfig.ServerName = host + tenant.Host = fmt.Sprintf("https://127.0.0.1:%d", localPort) + return tenant, stop +} + +// portForwardAPIServer forwards a random local port to port 6443 of the +// first Ready tenant apiserver pod and returns the local port. +func portForwardAPIServer(ctx context.Context, t *testing.T) (uint16, func()) { + t.Helper() + + deploy, err := clientset.AppsV1().Deployments(e2eNamespace).Get(ctx, tcpName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get tenant control plane deployment: %v", err) + } + selector := metav1.FormatLabelSelector(deploy.Spec.Selector) + pods, err := clientset.CoreV1().Pods(e2eNamespace).List(ctx, metav1.ListOptions{LabelSelector: selector}) + if err != nil || len(pods.Items) == 0 { + t.Fatalf("list tenant control plane pods (%s): %v", selector, err) + } + var podName string + for _, p := range pods.Items { + if p.Status.Phase == corev1.PodRunning { + podName = p.Name + break + } + } + if podName == "" { + t.Fatalf("no running tenant control plane pod among %d", len(pods.Items)) + } + + req := clientset.CoreV1().RESTClient().Post(). + Resource("pods").Namespace(e2eNamespace).Name(podName).SubResource("portforward") + transport, upgrader, err := spdy.RoundTripperFor(cfg) + if err != nil { + t.Fatalf("build spdy roundtripper: %v", err) + } + dialer := spdy.NewDialer(upgrader, &http.Client{Transport: transport}, "POST", &url.URL{ + Scheme: req.URL().Scheme, Host: req.URL().Host, Path: req.URL().Path, + }) + + stopCh := make(chan struct{}) + readyCh := make(chan struct{}) + fw, err := portforward.New(dialer, []string{"0:6443"}, stopCh, readyCh, nil, &testWriter{t}) + if err != nil { + t.Fatalf("create port-forward: %v", err) + } + errCh := make(chan error, 1) + go func() { errCh <- fw.ForwardPorts() }() + select { + case <-readyCh: + case err := <-errCh: + t.Fatalf("port-forward to %s: %v", podName, err) + case <-time.After(30 * time.Second): + t.Fatalf("port-forward to %s: timeout", podName) + } + ports, err := fw.GetPorts() + if err != nil || len(ports) == 0 { + t.Fatalf("get forwarded ports: %v", err) + } + t.Logf("port-forward 127.0.0.1:%d -> %s:6443", ports[0].Local, podName) + return ports[0].Local, func() { close(stopCh) } +} + +// etcdKeys dumps all keys from the etcd cluster by exec-ing etcdctl inside +// a member pod, authenticating with the server keypair (issued with +// clientAuth EKU for exactly this kind of loopback use). Member pods carry +// operator-generated names, so the pod is found via the cluster label. +func etcdKeys(ctx context.Context, t *testing.T) string { + t.Helper() + pods := &corev1.PodList{} + if err := kube.List(ctx, pods, client.InNamespace(e2eNamespace), + client.MatchingLabels{"etcd-operator.cozystack.io/cluster": clusterName}); err != nil || len(pods.Items) == 0 { + t.Fatalf("list etcd member pods: %v (found %d)", err, len(pods.Items)) + } + stdout, stderr, err := podExec(ctx, e2eNamespace, pods.Items[0].Name, "etcd", []string{ + "etcdctl", + "--endpoints=https://localhost:2379", + "--cert=/etc/etcd/tls/client/tls.crt", + "--key=/etc/etcd/tls/client/tls.key", + "--cacert=/etc/etcd/tls/client/ca.crt", + "get", "", "--prefix", "--keys-only", + }) + if err != nil { + t.Fatalf("etcdctl key dump: %v (stderr: %s)", err, stderr) + } + return stdout +} + +func podExec(ctx context.Context, namespace, pod, container string, command []string) (string, string, error) { + req := clientset.CoreV1().RESTClient().Post(). + Resource("pods").Namespace(namespace).Name(pod).SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: container, + Command: command, + Stdout: true, + Stderr: true, + }, scheme.ParameterCodec) + exec, err := remotecommand.NewSPDYExecutor(cfg, "POST", req.URL()) + if err != nil { + return "", "", err + } + var stdout, stderr bytes.Buffer + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{Stdout: &stdout, Stderr: &stderr}) + return stdout.String(), stderr.String(), err +} + +// deleteAndWait deletes an unstructured object and waits until it is gone. +func deleteAndWait(ctx context.Context, t *testing.T, apiVersion, kind, namespace, name string, timeout time.Duration) { + t.Helper() + obj := &unstructured.Unstructured{} + obj.SetAPIVersion(apiVersion) + obj.SetKind(kind) + obj.SetNamespace(namespace) + obj.SetName(name) + if err := kube.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { + t.Fatalf("delete %s %s/%s: %v", kind, namespace, name, err) + } + waitFor(ctx, t, timeout, fmt.Sprintf("%s %s/%s deleted", kind, namespace, name), func(ctx context.Context) error { + err := kube.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, obj.DeepCopy()) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("still present") + }) +} + +type testWriter struct{ t *testing.T } + +func (w *testWriter) Write(p []byte) (int, error) { + w.t.Logf("port-forward: %s", strings.TrimSpace(string(p))) + return len(p), nil +} + +func mapKeys(m map[string][]byte) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "…" +} diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go new file mode 100644 index 00000000..0f38c300 --- /dev/null +++ b/test/e2e/suite_test.go @@ -0,0 +1,213 @@ +//go:build e2e + +// Package e2e holds integration tests that run against a real cluster +// (kind in CI) with the operator, cert-manager and Kamaji installed. +// hack/e2e.sh provisions all of that and then runs `make test-e2e`. +package e2e + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "sort" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + apimeta "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + utilyaml "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "sigs.k8s.io/controller-runtime/pkg/client" + + etcdv1alpha2 "github.com/cozystack/etcd-operator/api/v1alpha2" +) + +const fieldOwner = client.FieldOwner("etcd-operator-e2e") + +var ( + cfg *rest.Config + kube client.Client + clientset *kubernetes.Clientset +) + +func TestMain(m *testing.M) { + rules := clientcmd.NewDefaultClientConfigLoadingRules() + c, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(rules, nil).ClientConfig() + if err != nil { + fmt.Fprintf(os.Stderr, "e2e: cannot load kubeconfig: %v\n", err) + os.Exit(1) + } + cfg = c + + scheme := runtime.NewScheme() + if err := corev1.AddToScheme(scheme); err != nil { + fmt.Fprintf(os.Stderr, "e2e: %v\n", err) + os.Exit(1) + } + if err := etcdv1alpha2.AddToScheme(scheme); err != nil { + fmt.Fprintf(os.Stderr, "e2e: %v\n", err) + os.Exit(1) + } + kube, err = client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + fmt.Fprintf(os.Stderr, "e2e: cannot build client: %v\n", err) + os.Exit(1) + } + clientset, err = kubernetes.NewForConfig(cfg) + if err != nil { + fmt.Fprintf(os.Stderr, "e2e: cannot build clientset: %v\n", err) + os.Exit(1) + } + + os.Exit(m.Run()) +} + +// applyFixture server-side-applies every document of a testdata YAML file. +// Unknown-to-the-scheme kinds (cert-manager, Kamaji) go through as +// unstructured, so only the CRDs have to exist on the cluster. +func applyFixture(ctx context.Context, t *testing.T, path string) []*unstructured.Unstructured { + t.Helper() + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read fixture %s: %v", path, err) + } + var applied []*unstructured.Unstructured + dec := utilyaml.NewYAMLReader(bufio.NewReader(bytes.NewReader(raw))) + for { + doc, err := dec.Read() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("parse fixture %s: %v", path, err) + } + obj := &unstructured.Unstructured{} + if err := utilyaml.Unmarshal(doc, &obj.Object); err != nil { + t.Fatalf("decode fixture %s: %v", path, err) + } + if len(obj.Object) == 0 { // comment-only document + continue + } + if err := kube.Patch(ctx, obj, client.Apply, fieldOwner, client.ForceOwnership); err != nil { + t.Fatalf("apply %s %s/%s from %s: %v", + obj.GetKind(), obj.GetNamespace(), obj.GetName(), path, err) + } + t.Logf("applied %s %s/%s", obj.GetKind(), obj.GetNamespace(), obj.GetName()) + applied = append(applied, obj) + } + return applied +} + +// fixturePaths returns testdata/*.yaml sorted by name; the two-digit +// prefixes encode apply order. +func fixturePaths(t *testing.T) []string { + t.Helper() + paths, err := filepath.Glob(filepath.Join("testdata", "*.yaml")) + if err != nil || len(paths) == 0 { + t.Fatalf("no fixtures found: %v", err) + } + sort.Strings(paths) + return paths +} + +// waitFor polls fn until it returns nil or the timeout elapses. fn's last +// error is included in the failure to make timeouts diagnosable. +func waitFor(ctx context.Context, t *testing.T, timeout time.Duration, desc string, fn func(context.Context) error) { + t.Helper() + t.Logf("waiting up to %s for %s", timeout, desc) + deadline := time.Now().Add(timeout) + var lastErr error + for { + cctx, cancel := context.WithTimeout(ctx, 30*time.Second) + lastErr = fn(cctx) + cancel() + if lastErr == nil { + return + } + if time.Now().After(deadline) { + t.Fatalf("timed out waiting for %s: %v", desc, lastErr) + } + select { + case <-ctx.Done(): + t.Fatalf("context cancelled waiting for %s: %v (last: %v)", desc, ctx.Err(), lastErr) + case <-time.After(5 * time.Second): + } + } +} + +// secretExists fails the wait round unless the Secret exists and carries +// all the listed keys with non-empty values. +func secretExists(namespace, name string, keys ...string) func(context.Context) error { + return func(ctx context.Context) error { + sec := &corev1.Secret{} + if err := kube.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, sec); err != nil { + return err + } + for _, k := range keys { + if len(sec.Data[k]) == 0 { + return fmt.Errorf("secret %s/%s missing key %q", namespace, name, k) + } + } + return nil + } +} + +// etcdClusterAvailable checks the typed EtcdCluster Available condition. +func etcdClusterAvailable(namespace, name string) func(context.Context) error { + return func(ctx context.Context) error { + ec := &etcdv1alpha2.EtcdCluster{} + if err := kube.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, ec); err != nil { + return err + } + cond := apimeta.FindStatusCondition(ec.Status.Conditions, etcdv1alpha2.ClusterAvailable) + if cond == nil { + return fmt.Errorf("condition %s not reported yet", etcdv1alpha2.ClusterAvailable) + } + if cond.Status != "True" { + return fmt.Errorf("condition %s=%s (%s): %s", cond.Type, cond.Status, cond.Reason, cond.Message) + } + return nil + } +} + +// unstructuredField reads a dotted path from an unstructured object fetched +// live from the cluster. +func unstructuredField(ctx context.Context, apiVersion, kind, namespace, name string, fields ...string) (string, error) { + obj := &unstructured.Unstructured{} + obj.SetAPIVersion(apiVersion) + obj.SetKind(kind) + if err := kube.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, obj); err != nil { + return "", err + } + val, found, err := unstructured.NestedString(obj.Object, fields...) + if err != nil || !found { + return "", fmt.Errorf("field %v not found on %s %s/%s: %v", fields, kind, namespace, name, err) + } + return val, nil +} + +// httpGet performs a plain GET against url with the given TLS-configured +// transport and returns the body. +func httpGet(t *testing.T, rt http.RoundTripper, url string) (int, string) { + t.Helper() + httpClient := &http.Client{Transport: rt, Timeout: 15 * time.Second} + resp, err := httpClient.Get(url) + if err != nil { + t.Fatalf("GET %s: %v", url, err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("GET %s: read body: %v", url, err) + } + return resp.StatusCode, string(body) +} diff --git a/test/e2e/testdata/00-namespace.yaml b/test/e2e/testdata/00-namespace.yaml new file mode 100644 index 00000000..0349908b --- /dev/null +++ b/test/e2e/testdata/00-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kamaji-e2e diff --git a/test/e2e/testdata/01-pki.yaml b/test/e2e/testdata/01-pki.yaml new file mode 100644 index 00000000..4bfc7d8c --- /dev/null +++ b/test/e2e/testdata/01-pki.yaml @@ -0,0 +1,59 @@ +# PKI for the Kamaji DataStore scenario. One CA signs everything: the +# operator's cert-manager-mode certificates (server, operator-client, peer) +# and Kamaji's own client certificate. Kamaji additionally receives the CA +# *private key* through the DataStore — its etcd driver signs a per-TCP +# apiserver-etcd-client certificate with it, so etcd (which trusts this CA +# for client auth) accepts tenant API servers without further wiring. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned + namespace: kamaji-e2e +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: etcd-ca + namespace: kamaji-e2e +spec: + isCA: true + commonName: etcd-ca + secretName: etcd-ca-tls + duration: 87600h + usages: + - signing + - key encipherment + - cert sign + issuerRef: + name: selfsigned + kind: Issuer +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: etcd-ca-issuer + namespace: kamaji-e2e +spec: + ca: + secretName: etcd-ca-tls +--- +# Client certificate Kamaji presents on its own dials (health checks, +# datastore setup). CN=root mirrors Cozystack's packages/extra/etcd chart. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: kamaji-etcd-client + namespace: kamaji-e2e +spec: + commonName: root + secretName: kamaji-etcd-client-tls + duration: 87600h + usages: + - signing + - key encipherment + - client auth + issuerRef: + name: etcd-ca-issuer + kind: Issuer diff --git a/test/e2e/testdata/02-etcdcluster.yaml b/test/e2e/testdata/02-etcdcluster.yaml new file mode 100644 index 00000000..1cc3233e --- /dev/null +++ b/test/e2e/testdata/02-etcdcluster.yaml @@ -0,0 +1,24 @@ +# Full-mTLS EtcdCluster in cert-manager mode. operatorClientIssuerRef turns +# on etcd client-cert verification (--client-cert-auth) against the issuing +# CA, which is the same CA the Kamaji DataStore signs tenant clients with. +apiVersion: etcd-operator.cozystack.io/v1alpha2 +kind: EtcdCluster +metadata: + name: etcd + namespace: kamaji-e2e +spec: + replicas: 3 + version: "3.6.11" + storage: + size: 1Gi + tls: + peer: + certManager: + issuerRef: + name: etcd-ca-issuer + client: + certManager: + serverIssuerRef: + name: etcd-ca-issuer + operatorClientIssuerRef: + name: etcd-ca-issuer diff --git a/test/e2e/testdata/03-datastore.yaml b/test/e2e/testdata/03-datastore.yaml new file mode 100644 index 00000000..f5f6a714 --- /dev/null +++ b/test/e2e/testdata/03-datastore.yaml @@ -0,0 +1,35 @@ +# Kamaji DataStore backed by the operator-managed cluster. Field layout is +# the same as Cozystack's packages/extra/etcd/templates/datastore.yaml: the +# CA entry carries both halves of the CA keypair (Kamaji signs per-TCP etcd +# client certs with it), the clientCertificate is Kamaji's own identity. +apiVersion: kamaji.clastix.io/v1alpha1 +kind: DataStore +metadata: + name: kamaji-e2e +spec: + driver: etcd + endpoints: + - etcd-client.kamaji-e2e.svc:2379 + tlsConfig: + certificateAuthority: + certificate: + secretReference: + name: etcd-ca-tls + namespace: kamaji-e2e + keyPath: tls.crt + privateKey: + secretReference: + name: etcd-ca-tls + namespace: kamaji-e2e + keyPath: tls.key + clientCertificate: + certificate: + secretReference: + name: kamaji-etcd-client-tls + namespace: kamaji-e2e + keyPath: tls.crt + privateKey: + secretReference: + name: kamaji-etcd-client-tls + namespace: kamaji-e2e + keyPath: tls.key diff --git a/test/e2e/testdata/04-tenantcontrolplane.yaml b/test/e2e/testdata/04-tenantcontrolplane.yaml new file mode 100644 index 00000000..635256e4 --- /dev/null +++ b/test/e2e/testdata/04-tenantcontrolplane.yaml @@ -0,0 +1,22 @@ +# Minimal tenant control plane: one apiserver replica, ClusterIP service, +# no konnectivity (the test attaches no worker nodes). The version must stay +# within the skew supported by the pinned Kamaji release (hack/e2e.sh). +apiVersion: kamaji.clastix.io/v1alpha1 +kind: TenantControlPlane +metadata: + name: tenant1 + namespace: kamaji-e2e +spec: + dataStore: kamaji-e2e + controlPlane: + deployment: + replicas: 1 + service: + serviceType: ClusterIP + kubernetes: + version: v1.30.2 + kubelet: + cgroupfs: systemd + networkProfile: + port: 6443 + addons: {}