Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
name: E2E

on:
# No PR branch filter: PRs in this repo target main as well as integration
# branches (e.g. v1), and the suite must gate all of them. A filter naming
# a branch PRs never target (this said `master`, which does not exist) means
# the workflow never runs at all.
#
# paths-ignore: a kind+cert-manager+Kamaji provisioning run costs ~30-45
# minutes; skip it for PRs that touch nothing the suite exercises
# (docs-only changes). Everything else — Go code, manifests, the harness,
# the workflows themselves — still gates.
pull_request:
paths-ignore:
- '**.md'
- 'docs/**'
push:
tags: [ "v*" ]
workflow_dispatch:

concurrency:
group: e2e-${{ github.ref }}
cancel-in-progress: true

jobs:
kamaji-datastore:
runs-on: ubuntu-latest
timeout-minutes: 45
steps:
- uses: actions/checkout@v4

- uses: actions/setup-go@v5
with:
go-version-file: go.mod
cache: true

- name: e2e (kind + cert-manager + Kamaji)
# Provisions an ephemeral kind cluster, installs cert-manager and
# Kamaji at the versions pinned in hack/e2e.sh, builds the operator
# from this checkout and runs test/e2e — the Kamaji DataStore
# consumer scenario.
#
# On failure, hack/e2e.sh's EXIT trap dumps the cluster state
# (CRs, pods, operator and Kamaji logs) into this step's output
# BEFORE tearing the kind cluster down. Don't add a separate
# dump step here: by the time it would run, the trap has already
# deleted the cluster and every kubectl call would come up empty.
run: make e2e
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ vet: ## Run go vet against code.
test: manifests generate fmt vet envtest ## Run tests.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./... -coverprofile cover.out

.PHONY: test-e2e
test-e2e: ## Run the e2e suite against the current kubeconfig context (expects cert-manager, Kamaji and the operator installed; see hack/e2e.sh).
go test -tags e2e -count=1 ./test/e2e/ -v -timeout 30m

.PHONY: e2e
e2e: ## Provision a kind cluster with cert-manager and Kamaji, deploy the operator, run the e2e suite. KEEP_CLUSTER=1 keeps the cluster for debugging.
hack/e2e.sh

##@ Build

.PHONY: build
Expand Down
175 changes: 175 additions & 0 deletions cmd/kubectl-etcd/helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package main

import (
"errors"
"fmt"
"strings"
"testing"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
)

// ── A failed/never-ready port-forward must not hang ─────────────────────────

func TestAwaitForward_Ready(t *testing.T) {
ready := make(chan struct{}, 1)
close(ready)
if err := awaitForward(ready, make(chan error, 1), make(chan struct{}, 1), time.Second); err != nil {
t.Fatalf("ready forward should succeed, got %v", err)
}
}

func TestAwaitForward_ErrorBeforeReady(t *testing.T) {
// ForwardPorts returns an error without ever closing readyChan — the old
// code blocked on <-readyChan forever. awaitForward must return the error.
forwardErr := make(chan error, 1)
forwardErr <- fmt.Errorf("dial tcp: connection refused")
err := awaitForward(make(chan struct{}), forwardErr, make(chan struct{}, 1), time.Second)
if err == nil {
t.Fatal("a forward failure must return an error, not hang or succeed")
}
if !strings.Contains(err.Error(), "connection refused") {
t.Errorf("error should wrap the forward failure, got %v", err)
}
}

func TestAwaitForward_Timeout(t *testing.T) {
stop := make(chan struct{}, 1)
// Nothing ever signals ready and no error arrives → must time out, not hang.
err := awaitForward(make(chan struct{}), make(chan error, 1), stop, 10*time.Millisecond)
if err == nil {
t.Fatal("a never-ready forward must time out, not hang")
}
select {
case <-stop:
default:
t.Error("timeout must close stopChan to tear the forwarder down")
}
}

// ── TLS secret discovery on the etcd container ───────────────────────────────

// etcdTLSContainer builds an etcd container + Pod shaped like the operator's
// buildPod output: the client-TLS Secret volume mounted at
// /etc/etcd/tls/client and the given etcd args.
func etcdTLSContainer(args ...string) (*corev1.Pod, corev1.Container) {
c := corev1.Container{
Name: "etcd",
Command: append([]string{"etcd"}, args...),
VolumeMounts: []corev1.VolumeMount{{Name: "tls-client", MountPath: "/etc/etcd/tls/client"}},
}
pod := &corev1.Pod{
Spec: corev1.PodSpec{
Containers: []corev1.Container{c},
Volumes: []corev1.Volume{{
Name: "tls-client",
VolumeSource: corev1.VolumeSource{Secret: &corev1.SecretVolumeSource{SecretName: "etcd-client-tls"}},
}},
},
}
return pod, c
}

func TestFindSecretNameForTLS_ServerTLSOnly(t *testing.T) {
pod, c := etcdTLSContainer("--cert-file=/etc/etcd/tls/client/tls.crt")
name, mTLS, err := findSecretNameForTLS(pod, c)
if err != nil {
t.Fatal(err)
}
if name != "etcd-client-tls" {
t.Errorf("secret name = %q, want %q", name, "etcd-client-tls")
}
if mTLS {
t.Error("no --client-cert-auth/--trusted-ca-file present, mTLS must be false")
}
}

// mTLS is reported when the server demands a client certificate — via either
// the --trusted-ca-file or the --client-cert-auth flag.
func TestFindSecretNameForTLS_MTLSDetection(t *testing.T) {
for _, flag := range []string{"--trusted-ca-file=/etc/etcd/tls/client/ca.crt", "--client-cert-auth=true", "--client-cert-auth"} {
pod, c := etcdTLSContainer("--cert-file=/etc/etcd/tls/client/tls.crt", flag)
_, mTLS, err := findSecretNameForTLS(pod, c)
if err != nil {
t.Fatalf("%s: %v", flag, err)
}
if !mTLS {
t.Errorf("%s must flag mTLS", flag)
}
}
}

func TestFindSecretNameForTLS_NoCertFile(t *testing.T) {
c := corev1.Container{Name: "etcd", Args: []string{"--listen-client-urls=http://0.0.0.0:2379"}}
pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{c}}}
_, _, err := findSecretNameForTLS(pod, c)
if !errors.Is(err, errNoCertFile) {
t.Errorf("expected errNoCertFile sentinel (plaintext cluster), got %v", err)
}
}

func TestFindSecretNameForTLS_MountNotFound(t *testing.T) {
c := corev1.Container{Name: "etcd", Args: []string{"--cert-file=/nowhere/tls.crt"}}
pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{c}}}
if _, _, err := findSecretNameForTLS(pod, c); err == nil {
t.Error("expected an error when no volume backs the cert path")
}
}

// ── Credential loading from a kubernetes.io/basic-auth Secret ───────────────

func TestLoadCredentials_Happy(t *testing.T) {
cs := fake.NewSimpleClientset(&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"},
Data: map[string][]byte{
corev1.BasicAuthUsernameKey: []byte("root"),
corev1.BasicAuthPasswordKey: []byte("s3cret"),
},
})
u, p, err := loadCredentials(cs, "ns1", "creds")
if err != nil {
t.Fatal(err)
}
if u != "root" || p != "s3cret" {
t.Errorf("got %q/%q, want root/s3cret", u, p)
}
}

func TestLoadCredentials_DefaultsUsernameToRoot(t *testing.T) {
cs := fake.NewSimpleClientset(&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"},
Data: map[string][]byte{corev1.BasicAuthPasswordKey: []byte("p")},
})
u, _, err := loadCredentials(cs, "ns1", "creds")
if err != nil {
t.Fatal(err)
}
if u != "root" {
t.Errorf("username default = %q, want root", u)
}
}

func TestLoadCredentials_MissingPassword(t *testing.T) {
cs := fake.NewSimpleClientset(&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "ns1"},
Data: map[string][]byte{corev1.BasicAuthUsernameKey: []byte("root")},
})
if _, _, err := loadCredentials(cs, "ns1", "creds"); err == nil {
t.Error("expected an error when the password key is missing/empty")
}
}

func TestLoadCredentials_NamespacedRef(t *testing.T) {
cs := fake.NewSimpleClientset(&corev1.Secret{
ObjectMeta: metav1.ObjectMeta{Name: "creds", Namespace: "other"},
Data: map[string][]byte{corev1.BasicAuthPasswordKey: []byte("p")},
})
// Default namespace is ns1, but the "other/creds" ref must override it.
_, p, err := loadCredentials(cs, "ns1", "other/creds")
if err != nil || p != "p" {
t.Errorf("namespace/name ref not honored: p=%q err=%v", p, err)
}
}
5 changes: 4 additions & 1 deletion config/default/manager_auth_proxy_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ spec:
capabilities:
drop:
- "ALL"
image: gcr.io/kubebuilder/kube-rbac-proxy:v0.13.1
# gcr.io/kubebuilder/kube-rbac-proxy is gone (the gcr.io/kubebuilder
# registry shut down in early 2025); upstream now releases under its
# own GitHub org at ghcr.io/kube-rbac-proxy/kube-rbac-proxy.
image: ghcr.io/kube-rbac-proxy/kube-rbac-proxy:v0.22.0
args:
- "--secure-listen-address=0.0.0.0:8443"
- "--upstream=http://127.0.0.1:8080/"
Expand Down
13 changes: 12 additions & 1 deletion controllers/etcdcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,8 +452,19 @@ func (r *EtcdClusterReconciler) bootstrap(
// Pod label on the first reconcile of the seed Pod, so the PDB
// selects it from creation rather than only after MemberList runs
// a few reconciles later.
//
// Written as a merge patch, not an Update: the member controller
// reconciles the seed the moment it is created and its own
// Status().Update races with this write — an optimistic-locked
// Update here loses that race with a conflict, and a cache-backed
// re-Get cannot recover (the informer may not even have the
// object yet). This stamp happens only in the create branch —
// losing it would leave the seed permanently IsVoter=false, and
// both learner memberID discovery and promotion filter their dial
// endpoints to voters, wedging every later scale-up.
stampBase := seed.DeepCopy()
seed.Status.IsVoter = true
if err := r.Status().Update(ctx, seed); err != nil {
if err := r.Status().Patch(ctx, seed, client.MergeFrom(stampBase)); err != nil {
return ctrl.Result{}, err
}
} else if !metav1.IsControlledBy(seed, cluster) {
Expand Down
63 changes: 63 additions & 0 deletions controllers/etcdcluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ import (
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"

lll "github.com/cozystack/etcd-operator/api/v1alpha2"
)
Expand Down Expand Up @@ -110,6 +112,67 @@ func TestBootstrap_CreatesSingleSeedMember(t *testing.T) {
if !seed.Spec.Bootstrap {
t.Fatalf("seed member must be marked Bootstrap=true")
}
// The seed is pre-stamped IsVoter=true at create time (it forms the
// cluster on its own — never a learner). Losing the stamp wedges every
// later scale-up: learner memberID discovery and promotion dial voters
// only.
if !seed.Status.IsVoter {
t.Fatalf("seed Status.IsVoter must be pre-stamped true at create; got false")
}
}

// TestBootstrap_SeedIsVoterStampSurvivesConcurrentStatusWriter pins the
// merge-patch choice in the seed IsVoter pre-stamp
// (etcdcluster_controller.go bootstrap). The member controller starts
// reconciling the seed the moment Create returns, and its own
// Status().Update bumps the resourceVersion concurrently with this stamp.
// An optimistic-locked Status().Update here loses that race with a
// Conflict, and — because the stamp lives only in the create branch — the
// seed would stay IsVoter=false forever. The interceptor models the
// concurrent writer by failing every optimistic-locked EtcdMember status
// Update with a Conflict; the merge patch carries no resourceVersion and
// must land regardless. This test fails against the previous
// Status().Update implementation.
func TestBootstrap_SeedIsVoterStampSurvivesConcurrentStatusWriter(t *testing.T) {
ctx := context.Background()
cluster := &lll.EtcdCluster{
ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "ns"},
Spec: lll.EtcdClusterSpec{
Replicas: ptrInt32(3),
Version: "3.5.17",
Storage: lll.StorageSpec{Size: quickQty(t, "1Gi")},
},
}
s := testScheme(t)
c := fake.NewClientBuilder().
WithScheme(s).
WithObjects(cluster).
WithStatusSubresource(&lll.EtcdCluster{}, &lll.EtcdMember{}, &lll.EtcdSnapshot{}).
WithInterceptorFuncs(interceptor.Funcs{
SubResourceUpdate: func(ctx context.Context, cl client.Client, sub string, obj client.Object, opts ...client.SubResourceUpdateOption) error {
if _, isMember := obj.(*lll.EtcdMember); isMember && sub == "status" {
return apierrors.NewConflict(
schema.GroupResource{Group: lll.GroupVersion.Group, Resource: "etcdmembers"},
obj.GetName(), errors.New("simulated concurrent status writer"))
}
return cl.SubResource(sub).Update(ctx, obj, opts...)
},
}).
Build()
r := &EtcdClusterReconciler{Client: c, Scheme: s, EtcdClientFactory: factoryReturning(newFakeEtcd(0xdeadbeef))}

reconcileUntilStable(t, r, c, "test", "ns", 8)

members := &lll.EtcdMemberList{}
if err := c.List(ctx, members, client.InNamespace("ns")); err != nil {
t.Fatalf("List: %v", err)
}
if len(members.Items) != 1 {
t.Fatalf("expected one seed member; got %d", len(members.Items))
}
if !members.Items[0].Status.IsVoter {
t.Fatal("seed IsVoter stamp lost under a concurrent status writer; the stamp must be a merge patch, not an optimistic-locked Update")
}
}

// TestObservedSpec_LocksMidFlight pins the locking pattern's correctness
Expand Down
Loading
Loading