From 39ffc19f43ae2fd51393af8e8ada18a839065d0f Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed, 27 May 2026 11:52:42 -0700 Subject: [PATCH 1/7] feat(etcd): add EndpointSlice self-registration to ensure-dns init container Under high HCP density (~2,500 namespaces), the management cluster's kube-controller-manager EndpointSlice informer cache goes stale, delaying etcd-discovery headless Service DNS records by minutes to hours. This causes etcd pods to block in the ensure-dns init container, preventing cluster creation and triggering E2E test timeouts. Each etcd pod now self-registers its IP into a custom EndpointSlice (managed-by: control-plane-operator) before polling DNS. The standard EndpointSlice controller ignores slices with a different managed-by label, so there is no conflict. CoreDNS picks up the custom slice via its watch within ~100ms, making DNS resolution near-instant regardless of controller informer health. A new dedicated "etcd" service account replaces the conditional "etcd-defrag-controller" SA, with RBAC granting get/create/update on EndpointSlices and get on pods. The defrag Role/RoleBinding remain HA-only. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../hostedcontrolplane/manifests/etcd.go | 27 ++++ ...Components_etcd_controlplanecomponent.yaml | 9 ++ ...aneComponents_etcd_self_register_role.yaml | 28 ++++ ...onents_etcd_self_register_rolebinding.yaml | 21 +++ ...olPlaneComponents_etcd_serviceaccount.yaml | 15 ++ ...ntrolPlaneComponents_etcd_statefulset.yaml | 8 +- ...Components_etcd_controlplanecomponent.yaml | 9 ++ ...aneComponents_etcd_self_register_role.yaml | 28 ++++ ...onents_etcd_self_register_rolebinding.yaml | 21 +++ ...olPlaneComponents_etcd_serviceaccount.yaml | 15 ++ ...ntrolPlaneComponents_etcd_statefulset.yaml | 8 +- ...Components_etcd_controlplanecomponent.yaml | 9 ++ ...aneComponents_etcd_self_register_role.yaml | 28 ++++ ...onents_etcd_self_register_rolebinding.yaml | 21 +++ ...olPlaneComponents_etcd_serviceaccount.yaml | 15 ++ ...ntrolPlaneComponents_etcd_statefulset.yaml | 8 +- ...Components_etcd_controlplanecomponent.yaml | 9 ++ ...aneComponents_etcd_self_register_role.yaml | 28 ++++ ...onents_etcd_self_register_rolebinding.yaml | 21 +++ ...olPlaneComponents_etcd_serviceaccount.yaml | 15 ++ ...ntrolPlaneComponents_etcd_statefulset.yaml | 8 +- ...Components_etcd_controlplanecomponent.yaml | 9 ++ ...aneComponents_etcd_self_register_role.yaml | 28 ++++ ...onents_etcd_self_register_rolebinding.yaml | 21 +++ ...olPlaneComponents_etcd_serviceaccount.yaml | 15 ++ ...ntrolPlaneComponents_etcd_statefulset.yaml | 8 +- .../v2/assets/etcd/defrag-rolebinding.yaml | 2 +- .../assets/etcd/etcd-self-register-role.yaml | 19 +++ .../etcd/etcd-self-register-rolebinding.yaml | 11 ++ ...eaccount.yaml => etcd-serviceaccount.yaml} | 2 +- .../v2/assets/etcd/statefulset.yaml | 7 +- .../hostedcontrolplane/v2/etcd/component.go | 9 +- .../hostedcontrolplane/v2/etcd/statefulset.go | 3 +- dnsresolver/cmd.go | 129 ++++++++++++++++++ dnsresolver/cmd_test.go | 55 ++++++++ 35 files changed, 658 insertions(+), 11 deletions(-) create mode 100644 
control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml create mode 100644 
control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-role.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding.yaml rename control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/{defrag-serviceaccount.yaml => etcd-serviceaccount.yaml} (59%) create mode 100644 dnsresolver/cmd_test.go diff --git a/control-plane-operator/controllers/hostedcontrolplane/manifests/etcd.go b/control-plane-operator/controllers/hostedcontrolplane/manifests/etcd.go index 8d70f9c838f..6901f2d4555 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/manifests/etcd.go +++ b/control-plane-operator/controllers/hostedcontrolplane/manifests/etcd.go @@ -91,6 +91,33 @@ func EtcdDefragControllerServiceAccount(ns string) *corev1.ServiceAccount { } } +func EtcdServiceAccount(ns string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd", + Namespace: ns, + }, + } +} + +func EtcdSelfRegisterRole(ns string) *rbacv1.Role { + return &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-self-register", + Namespace: ns, + }, + } +} + +func EtcdSelfRegisterRoleBinding(ns string) *rbacv1.RoleBinding { + return &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-self-register", + Namespace: ns, + }, + } +} + func EtcdBackupServiceAccount(hcpNamespace string) *corev1.ServiceAccount { return &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ diff 
--git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml index 44d35df32f1..ac44d424b8f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml @@ -22,6 +22,15 @@ status: - group: "" kind: Service name: etcd-discovery + - group: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register + - group: rbac.authorization.k8s.io + kind: RoleBinding + name: etcd-self-register + - group: "" + kind: ServiceAccount + name: etcd - group: policy kind: PodDisruptionBudget name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml new file mode 100644 index 00000000000..3e97f78628b --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get 
diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml new file mode 100644 index 00000000000..f8cada01fe4 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd + namespace: hcp-namespace diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml new file mode 100644 index 00000000000..00f053fdf28 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +imagePullSecrets: +- name: pull-secret +kind: ServiceAccount +metadata: + name: etcd + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" diff --git 
a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml index ba0322c7603..46353bf2b68 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/AROSwift/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml @@ -270,7 +270,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - /bin/bash env: @@ -279,6 +279,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns @@ -351,6 +356,7 @@ spec: priorityClassName: hypershift-etcd restartPolicy: Always schedulerName: default-scheduler + serviceAccountName: etcd tolerations: - effect: NoSchedule key: hypershift.openshift.io/control-plane diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml index 44d35df32f1..ac44d424b8f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml @@ -22,6 +22,15 @@ status: - 
group: "" kind: Service name: etcd-discovery + - group: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register + - group: rbac.authorization.k8s.io + kind: RoleBinding + name: etcd-self-register + - group: "" + kind: ServiceAccount + name: etcd - group: policy kind: PodDisruptionBudget name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml new file mode 100644 index 00000000000..3e97f78628b --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml new file mode 100644 index 00000000000..f8cada01fe4 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - 
apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd + namespace: hcp-namespace diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml new file mode 100644 index 00000000000..00f053fdf28 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +imagePullSecrets: +- name: pull-secret +kind: ServiceAccount +metadata: + name: etcd + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml index 902857d72a0..7d2d81bee1e 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/GCP/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml @@ -285,7 +285,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - 
/bin/bash env: @@ -294,6 +294,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns @@ -378,6 +383,7 @@ spec: priorityClassName: hypershift-etcd restartPolicy: Always schedulerName: default-scheduler + serviceAccountName: etcd tolerations: - effect: NoSchedule key: hypershift.openshift.io/control-plane diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml index 44d35df32f1..ac44d424b8f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml @@ -22,6 +22,15 @@ status: - group: "" kind: Service name: etcd-discovery + - group: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register + - group: rbac.authorization.k8s.io + kind: RoleBinding + name: etcd-self-register + - group: "" + kind: ServiceAccount + name: etcd - group: policy kind: PodDisruptionBudget name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml new file mode 100644 index 00000000000..3e97f78628b --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml @@ -0,0 
+1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml new file mode 100644 index 00000000000..f8cada01fe4 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd + namespace: hcp-namespace diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml new file mode 100644 index 00000000000..00f053fdf28 --- /dev/null +++ 
b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +imagePullSecrets: +- name: pull-secret +kind: ServiceAccount +metadata: + name: etcd + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml index ba0322c7603..46353bf2b68 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/IBMCloud/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml @@ -270,7 +270,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - /bin/bash env: @@ -279,6 +279,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns @@ -351,6 +356,7 @@ spec: priorityClassName: hypershift-etcd restartPolicy: Always schedulerName: default-scheduler + serviceAccountName: etcd tolerations: - effect: NoSchedule key: hypershift.openshift.io/control-plane diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml 
b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml index 44d35df32f1..ac44d424b8f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml @@ -22,6 +22,15 @@ status: - group: "" kind: Service name: etcd-discovery + - group: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register + - group: rbac.authorization.k8s.io + kind: RoleBinding + name: etcd-self-register + - group: "" + kind: ServiceAccount + name: etcd - group: policy kind: PodDisruptionBudget name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml new file mode 100644 index 00000000000..3e97f78628b --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git 
a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml new file mode 100644 index 00000000000..f8cada01fe4 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml @@ -0,0 +1,21 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd + namespace: hcp-namespace diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml new file mode 100644 index 00000000000..00f053fdf28 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +imagePullSecrets: +- name: pull-secret +kind: ServiceAccount +metadata: + name: etcd + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" diff --git 
a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml index ba0322c7603..46353bf2b68 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/TechPreviewNoUpgrade/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml @@ -270,7 +270,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - /bin/bash env: @@ -279,6 +279,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns @@ -351,6 +356,7 @@ spec: priorityClassName: hypershift-etcd restartPolicy: Always schedulerName: default-scheduler + serviceAccountName: etcd tolerations: - effect: NoSchedule key: hypershift.openshift.io/control-plane diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml index 44d35df32f1..ac44d424b8f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml +++ 
b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_controlplanecomponent.yaml @@ -22,6 +22,15 @@ status: - group: "" kind: Service name: etcd-discovery + - group: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register + - group: rbac.authorization.k8s.io + kind: RoleBinding + name: etcd-self-register + - group: "" + kind: ServiceAccount + name: etcd - group: policy kind: PodDisruptionBudget name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml new file mode 100644 index 00000000000..3e97f78628b --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml new file mode 100644 index 00000000000..f8cada01fe4 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_self_register_rolebinding.yaml @@ -0,0 +1,21 @@ 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd + namespace: hcp-namespace diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml new file mode 100644 index 00000000000..00f053fdf28 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +imagePullSecrets: +- name: pull-secret +kind: ServiceAccount +metadata: + name: etcd + namespace: hcp-namespace + ownerReferences: + - apiVersion: hypershift.openshift.io/v1beta1 + blockOwnerDeletion: true + controller: true + kind: HostedControlPlane + name: hcp + uid: "" + resourceVersion: "1" diff --git a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml index ba0322c7603..46353bf2b68 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/testdata/etcd/zz_fixture_TestControlPlaneComponents_etcd_statefulset.yaml @@ -270,7 +270,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns 
${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - /bin/bash env: @@ -279,6 +279,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns @@ -351,6 +356,7 @@ spec: priorityClassName: hypershift-etcd restartPolicy: Always schedulerName: default-scheduler + serviceAccountName: etcd tolerations: - effect: NoSchedule key: hypershift.openshift.io/control-plane diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml index 6f0b4b7437f..42921e5cd19 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml @@ -8,4 +8,4 @@ roleRef: name: etcd-defrag-controller subjects: - kind: ServiceAccount - name: etcd-defrag-controller + name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-role.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-role.yaml new file mode 100644 index 00000000000..ccedae0b47d --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-role.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: etcd-self-register +rules: +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding.yaml 
b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding.yaml new file mode 100644 index 00000000000..f54159b5edc --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-serviceaccount.yaml similarity index 59% rename from control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml rename to control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-serviceaccount.yaml index d82e8fe6606..4dfa197c42e 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-serviceaccount.yaml @@ -1,4 +1,4 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: etcd-defrag-controller + name: etcd diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml index cac5289f08b..9ede7c53a14 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml @@ -189,7 +189,7 @@ spec: initContainers: - args: - -c - - exec control-plane-operator resolve-dns ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc + - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc command: - /bin/bash 
env: @@ -198,6 +198,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP image: controlplane-operator imagePullPolicy: IfNotPresent name: ensure-dns diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go index 3a09412175d..13a2b9676cc 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go @@ -51,8 +51,13 @@ func NewComponent() component.ControlPlaneComponent { component.WithPredicate(defragControllerPredicate), ). WithManifestAdapter( - "defrag-serviceaccount.yaml", - component.WithPredicate(defragControllerPredicate), + "etcd-serviceaccount.yaml", + ). + WithManifestAdapter( + "etcd-self-register-role.yaml", + ). + WithManifestAdapter( + "etcd-self-register-rolebinding.yaml", ). 
Build() } diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go index 49de8fc17d3..b43b081e42b 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go @@ -72,9 +72,10 @@ func adaptStatefulSet(cpContext component.WorkloadContext, sts *appsv1.StatefulS ) }) + sts.Spec.Template.Spec.ServiceAccountName = manifests.EtcdServiceAccount("").Name + if defragControllerPredicate(cpContext) { sts.Spec.Template.Spec.Containers = append(sts.Spec.Template.Spec.Containers, buildEtcdDefragControllerContainer(hcp.Namespace)) - sts.Spec.Template.Spec.ServiceAccountName = manifests.EtcdDefragControllerServiceAccount("").Name } snapshotRestored := meta.IsStatusConditionTrue(hcp.Status.Conditions, string(hyperv1.EtcdSnapshotRestored)) diff --git a/dnsresolver/cmd.go b/dnsresolver/cmd.go index 1bf060af4cc..80aeca82ed0 100644 --- a/dnsresolver/cmd.go +++ b/dnsresolver/cmd.go @@ -5,20 +5,35 @@ import ( "fmt" "net" "os" + "strings" "time" + corev1 "k8s.io/api/core/v1" + discoveryv1 "k8s.io/api/discovery/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/utils/ptr" "github.com/spf13/cobra" ) func NewCommand() *cobra.Command { + var selfRegister bool + cmd := &cobra.Command{ Use: "resolve-dns NAME", Short: "Utility that ensures a DNS name can be resolved.", Run: func(cmd *cobra.Command, args []string) { if len(args) == 0 { fmt.Printf("Specify a DNS name to lookup\n") + os.Exit(1) + } + if selfRegister { + if err := selfRegisterEndpointSlice(args[0]); err != nil { + fmt.Printf("Warning: self-registration failed, falling back to DNS-only: %v\n", err) + } } if err := resolveDNS(context.Background(), args[0]); err != nil { fmt.Printf("Error: %v", err) @@ -26,6 +41,8 
@@ func NewCommand() *cobra.Command { } }, } + + cmd.Flags().BoolVar(&selfRegister, "self-register", false, "Register this pod's IP into an EndpointSlice before resolving DNS") return cmd } @@ -45,3 +62,115 @@ func resolveDNS(ctx context.Context, hostName string) error { } return nil } + +// selfRegisterEndpointSlice creates an EndpointSlice for this pod's IP +// so that CoreDNS can resolve the pod's DNS name without waiting for the +// standard EndpointSlice controller, which may have a stale informer cache +// under high cluster density. +func selfRegisterEndpointSlice(dnsName string) error { + podIP := os.Getenv("POD_IP") + if podIP == "" { + return fmt.Errorf("POD_IP environment variable not set") + } + namespace := os.Getenv("NAMESPACE") + if namespace == "" { + return fmt.Errorf("NAMESPACE environment variable not set") + } + hostname, err := os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname: %w", err) + } + + serviceName, err := parseServiceName(dnsName) + if err != nil { + return fmt.Errorf("failed to parse service name from DNS name %q: %w", dnsName, err) + } + + config, err := rest.InClusterConfig() + if err != nil { + return fmt.Errorf("failed to get in-cluster config: %w", err) + } + client, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("failed to create kubernetes client: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + pod, err := client.CoreV1().Pods(namespace).Get(ctx, hostname, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get pod %s/%s: %w", namespace, hostname, err) + } + + addressType := discoveryv1.AddressTypeIPv4 + if net.ParseIP(podIP) != nil && net.ParseIP(podIP).To4() == nil { + addressType = discoveryv1.AddressTypeIPv6 + } + + sliceName := fmt.Sprintf("%s-self-%s", serviceName, hostname) + endpointSlice := &discoveryv1.EndpointSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: sliceName, + Namespace: 
namespace, + Labels: map[string]string{ + discoveryv1.LabelServiceName: serviceName, + discoveryv1.LabelManagedBy: "control-plane-operator", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: pod.Name, + UID: pod.UID, + }, + }, + }, + AddressType: addressType, + Endpoints: []discoveryv1.Endpoint{ + { + Addresses: []string{podIP}, + Hostname: ptr.To(hostname), + NodeName: ptr.To(pod.Spec.NodeName), + Conditions: discoveryv1.EndpointConditions{Ready: ptr.To(true)}, + TargetRef: &corev1.ObjectReference{ + Kind: "Pod", + Name: pod.Name, + Namespace: namespace, + UID: pod.UID, + }, + }, + }, + Ports: []discoveryv1.EndpointPort{ + {Name: ptr.To("peer"), Port: ptr.To(int32(2380)), Protocol: ptr.To(corev1.ProtocolTCP)}, + {Name: ptr.To("etcd-client"), Port: ptr.To(int32(2379)), Protocol: ptr.To(corev1.ProtocolTCP)}, + }, + } + + existing, err := client.DiscoveryV1().EndpointSlices(namespace).Get(ctx, sliceName, metav1.GetOptions{}) + if err == nil { + endpointSlice.ResourceVersion = existing.ResourceVersion + _, err = client.DiscoveryV1().EndpointSlices(namespace).Update(ctx, endpointSlice, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update EndpointSlice %s: %w", sliceName, err) + } + fmt.Printf("Updated self-registration EndpointSlice %s with address %s\n", sliceName, podIP) + } else { + _, err = client.DiscoveryV1().EndpointSlices(namespace).Create(ctx, endpointSlice, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create EndpointSlice %s: %w", sliceName, err) + } + fmt.Printf("Created self-registration EndpointSlice %s with address %s\n", sliceName, podIP) + } + return nil +} + +// parseServiceName extracts the service name from a headless service DNS name. 
+// Format: <hostname>.<service>.<namespace>.svc[.cluster.local] +func parseServiceName(dnsName string) (string, error) { + parts := strings.Split(dnsName, ".") + if len(parts) < 3 { + return "", fmt.Errorf("expected at least 3 dot-separated components, got %d", len(parts)) + } + return parts[1], nil +} diff --git a/dnsresolver/cmd_test.go b/dnsresolver/cmd_test.go new file mode 100644 index 00000000000..39ae0e6453a --- /dev/null +++ b/dnsresolver/cmd_test.go @@ -0,0 +1,55 @@ +package dnsresolver + +import ( + "testing" + + . "github.com/onsi/gomega" +) + +func TestParseServiceName(t *testing.T) { + tests := []struct { + name string + dnsName string + expected string + expectError bool + }{ + { + name: "When given a standard headless service DNS name it should extract the service name", + dnsName: "etcd-0.etcd-discovery.my-namespace.svc", + expected: "etcd-discovery", + }, + { + name: "When given a fully qualified DNS name it should extract the service name", + dnsName: "etcd-0.etcd-discovery.my-namespace.svc.cluster.local", + expected: "etcd-discovery", + }, + { + name: "When given a DNS name with a long namespace it should extract the service name", + dnsName: "etcd-2.etcd-discovery.ocm-arohcpci01-2q7h5rjtm2oud3pn6i3890qa6p37sts3-i2y6k1a2u2a0z1h.svc", + expected: "etcd-discovery", + }, + { + name: "When given a DNS name with too few components it should return an error", + dnsName: "etcd-0.etcd-discovery", + expectError: true, + }, + { + name: "When given a single component it should return an error", + dnsName: "etcd-0", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + result, err := parseServiceName(tt.dnsName) + if tt.expectError { + g.Expect(err).To(HaveOccurred()) + } else { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(result).To(Equal(tt.expected)) + } + }) + } +} From b877681cfffd8005c53aac602af89a4ff485881a Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed,
27 May 2026 14:16:37 -0700 Subject: [PATCH 2/7] fix(etcd): distinguish NotFound from other errors in EndpointSlice Get Use apierrors.IsNotFound to only fall through to Create when the EndpointSlice genuinely doesn't exist. RBAC, timeout, and other errors are now returned immediately instead of being masked by a subsequent Create attempt. Co-Authored-By: Claude Opus 4.6 (1M context) --- dnsresolver/cmd.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dnsresolver/cmd.go b/dnsresolver/cmd.go index 80aeca82ed0..64378fe8dd7 100644 --- a/dnsresolver/cmd.go +++ b/dnsresolver/cmd.go @@ -10,6 +10,7 @@ import ( corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" @@ -155,12 +156,14 @@ func selfRegisterEndpointSlice(dnsName string) error { return fmt.Errorf("failed to update EndpointSlice %s: %w", sliceName, err) } fmt.Printf("Updated self-registration EndpointSlice %s with address %s\n", sliceName, podIP) - } else { + } else if apierrors.IsNotFound(err) { _, err = client.DiscoveryV1().EndpointSlices(namespace).Create(ctx, endpointSlice, metav1.CreateOptions{}) if err != nil { return fmt.Errorf("failed to create EndpointSlice %s: %w", sliceName, err) } fmt.Printf("Created self-registration EndpointSlice %s with address %s\n", sliceName, podIP) + } else { + return fmt.Errorf("failed to get EndpointSlice %s: %w", sliceName, err) } return nil } From 30ffd78ade09a5e7b425405de8f91e5591e20082 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed, 27 May 2026 14:19:25 -0700 Subject: [PATCH 3/7] test(etcd): add unit tests for EndpointSlice self-registration Refactor selfRegisterEndpointSlice to extract testable ensureEndpointSlice function that accepts a kubernetes.Interface, enabling tests with the fake client. 
Tests cover: create, update, IPv6, missing pod, invalid DNS name, multiple pods, and field correctness. Coverage: ensureEndpointSlice 88.9%, parseServiceName 100%. Co-Authored-By: Claude Opus 4.6 (1M context) --- dnsresolver/cmd.go | 24 +++--- dnsresolver/cmd_test.go | 165 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 10 deletions(-) diff --git a/dnsresolver/cmd.go b/dnsresolver/cmd.go index 64378fe8dd7..54a57bf662a 100644 --- a/dnsresolver/cmd.go +++ b/dnsresolver/cmd.go @@ -64,10 +64,6 @@ func resolveDNS(ctx context.Context, hostName string) error { return nil } -// selfRegisterEndpointSlice creates an EndpointSlice for this pod's IP -// so that CoreDNS can resolve the pod's DNS name without waiting for the -// standard EndpointSlice controller, which may have a stale informer cache -// under high cluster density. func selfRegisterEndpointSlice(dnsName string) error { podIP := os.Getenv("POD_IP") if podIP == "" { @@ -82,11 +78,6 @@ func selfRegisterEndpointSlice(dnsName string) error { return fmt.Errorf("failed to get hostname: %w", err) } - serviceName, err := parseServiceName(dnsName) - if err != nil { - return fmt.Errorf("failed to parse service name from DNS name %q: %w", dnsName, err) - } - config, err := rest.InClusterConfig() if err != nil { return fmt.Errorf("failed to get in-cluster config: %w", err) @@ -96,7 +87,20 @@ func selfRegisterEndpointSlice(dnsName string) error { return fmt.Errorf("failed to create kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + return ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) +} + +// ensureEndpointSlice creates or updates a self-registration EndpointSlice for +// this pod's IP so that CoreDNS can resolve the pod's DNS name without waiting +// for the standard EndpointSlice controller, which may have a stale informer +// cache under high cluster density. 
+func ensureEndpointSlice(ctx context.Context, client kubernetes.Interface, dnsName, hostname, namespace, podIP string) error { + serviceName, err := parseServiceName(dnsName) + if err != nil { + return fmt.Errorf("failed to parse service name from DNS name %q: %w", dnsName, err) + } + + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() pod, err := client.CoreV1().Pods(namespace).Get(ctx, hostname, metav1.GetOptions{}) diff --git a/dnsresolver/cmd_test.go b/dnsresolver/cmd_test.go index 39ae0e6453a..c5bbcf7723c 100644 --- a/dnsresolver/cmd_test.go +++ b/dnsresolver/cmd_test.go @@ -1,9 +1,17 @@ package dnsresolver import ( + "context" "testing" . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + discoveryv1 "k8s.io/api/discovery/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/fake" ) func TestParseServiceName(t *testing.T) { @@ -53,3 +61,160 @@ func TestParseServiceName(t *testing.T) { }) } } + +func TestEnsureEndpointSlice(t *testing.T) { + const ( + namespace = "ocm-test-namespace" + hostname = "etcd-0" + podIP = "10.128.64.186" + dnsName = "etcd-0.etcd-discovery.ocm-test-namespace.svc" + podUID = "test-pod-uid-1234" + nodeName = "aks-userswft1-12345-vmss000000" + ) + + newPod := func() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: hostname, + Namespace: namespace, + UID: types.UID(podUID), + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + }, + } + } + + t.Run("When no EndpointSlice exists it should create one with correct fields", func(t *testing.T) { + g := NewGomegaWithT(t) + client := fake.NewSimpleClientset(newPod()) + + err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) + g.Expect(err).NotTo(HaveOccurred()) + + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + 
g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(slice.Labels[discoveryv1.LabelServiceName]).To(Equal("etcd-discovery")) + g.Expect(slice.Labels[discoveryv1.LabelManagedBy]).To(Equal("control-plane-operator")) + g.Expect(slice.AddressType).To(Equal(discoveryv1.AddressTypeIPv4)) + g.Expect(slice.Endpoints).To(HaveLen(1)) + g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{podIP})) + g.Expect(*slice.Endpoints[0].Hostname).To(Equal(hostname)) + g.Expect(*slice.Endpoints[0].NodeName).To(Equal(nodeName)) + g.Expect(*slice.Endpoints[0].Conditions.Ready).To(BeTrue()) + g.Expect(slice.Endpoints[0].TargetRef.Kind).To(Equal("Pod")) + g.Expect(slice.Endpoints[0].TargetRef.Name).To(Equal(hostname)) + g.Expect(slice.Endpoints[0].TargetRef.UID).To(Equal(types.UID(podUID))) + g.Expect(slice.Ports).To(HaveLen(2)) + g.Expect(*slice.Ports[0].Name).To(Equal("peer")) + g.Expect(*slice.Ports[0].Port).To(Equal(int32(2380))) + g.Expect(*slice.Ports[1].Name).To(Equal("etcd-client")) + g.Expect(*slice.Ports[1].Port).To(Equal(int32(2379))) + g.Expect(slice.OwnerReferences).To(HaveLen(1)) + g.Expect(slice.OwnerReferences[0].Kind).To(Equal("Pod")) + g.Expect(slice.OwnerReferences[0].Name).To(Equal(hostname)) + g.Expect(slice.OwnerReferences[0].UID).To(Equal(types.UID(podUID))) + }) + + t.Run("When an EndpointSlice already exists it should update it", func(t *testing.T) { + g := NewGomegaWithT(t) + existingSlice := &discoveryv1.EndpointSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-discovery-self-etcd-0", + Namespace: namespace, + Labels: map[string]string{ + discoveryv1.LabelServiceName: "etcd-discovery", + discoveryv1.LabelManagedBy: "control-plane-operator", + }, + }, + AddressType: discoveryv1.AddressTypeIPv4, + Endpoints: []discoveryv1.Endpoint{ + {Addresses: []string{"10.0.0.99"}}, + }, + } + client := fake.NewSimpleClientset(newPod(), existingSlice) + + err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) + 
g.Expect(err).NotTo(HaveOccurred()) + + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{podIP})) + }) + + t.Run("When given an IPv6 address it should set AddressTypeIPv6", func(t *testing.T) { + g := NewGomegaWithT(t) + client := fake.NewSimpleClientset(newPod()) + ipv6 := "fd00::1" + + err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, ipv6) + g.Expect(err).NotTo(HaveOccurred()) + + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(slice.AddressType).To(Equal(discoveryv1.AddressTypeIPv6)) + g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{ipv6})) + }) + + t.Run("When the pod does not exist it should return an error", func(t *testing.T) { + g := NewGomegaWithT(t) + client := fake.NewSimpleClientset() + + err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("failed to get pod")) + }) + + t.Run("When given an invalid DNS name it should return an error", func(t *testing.T) { + g := NewGomegaWithT(t) + client := fake.NewSimpleClientset(newPod()) + + err := ensureEndpointSlice(context.Background(), client, "invalid", hostname, namespace, podIP) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("failed to parse service name")) + }) + + t.Run("When called for etcd-1 it should use the correct slice name and hostname", func(t *testing.T) { + g := NewGomegaWithT(t) + pod1 := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-1", + Namespace: namespace, + UID: "uid-etcd-1", + }, + Spec: corev1.PodSpec{NodeName: nodeName}, + } + client := 
fake.NewSimpleClientset(pod1) + + err := ensureEndpointSlice(context.Background(), client, "etcd-1.etcd-discovery.ocm-test-namespace.svc", "etcd-1", namespace, "10.128.64.187") + g.Expect(err).NotTo(HaveOccurred()) + + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-1", metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(*slice.Endpoints[0].Hostname).To(Equal("etcd-1")) + g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{"10.128.64.187"})) + }) + + t.Run("When multiple etcd pods self-register it should create separate EndpointSlices", func(t *testing.T) { + g := NewGomegaWithT(t) + pods := []runtime.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "etcd-0", Namespace: namespace, UID: "uid-0"}, Spec: corev1.PodSpec{NodeName: nodeName}}, + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "etcd-1", Namespace: namespace, UID: "uid-1"}, Spec: corev1.PodSpec{NodeName: nodeName}}, + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "etcd-2", Namespace: namespace, UID: "uid-2"}, Spec: corev1.PodSpec{NodeName: nodeName}}, + } + client := fake.NewSimpleClientset(pods...) + + for i := range 3 { + h := metav1.ObjectMeta{Name: pods[i].(*corev1.Pod).Name}.Name + dns := h + ".etcd-discovery." + namespace + ".svc" + ip := "10.128.64." 
+ string(rune('1'+i)) + err := ensureEndpointSlice(context.Background(), client, dns, h, namespace, ip) + g.Expect(err).NotTo(HaveOccurred()) + } + + slices, err := client.DiscoveryV1().EndpointSlices(namespace).List(context.Background(), metav1.ListOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(slices.Items).To(HaveLen(3)) + }) +} From 8db6372ec129005c140e29d83be4e0d15f3c3aee Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed, 27 May 2026 14:33:12 -0700 Subject: [PATCH 4/7] fix(etcd): replace deprecated fake.NewSimpleClientset with NewClientset Co-Authored-By: Claude Opus 4.6 (1M context) --- dnsresolver/cmd_test.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dnsresolver/cmd_test.go b/dnsresolver/cmd_test.go index c5bbcf7723c..81859429e8e 100644 --- a/dnsresolver/cmd_test.go +++ b/dnsresolver/cmd_test.go @@ -87,7 +87,7 @@ func TestEnsureEndpointSlice(t *testing.T) { t.Run("When no EndpointSlice exists it should create one with correct fields", func(t *testing.T) { g := NewGomegaWithT(t) - client := fake.NewSimpleClientset(newPod()) + client := fake.NewClientset(newPod()) err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) g.Expect(err).NotTo(HaveOccurred()) @@ -133,7 +133,7 @@ func TestEnsureEndpointSlice(t *testing.T) { {Addresses: []string{"10.0.0.99"}}, }, } - client := fake.NewSimpleClientset(newPod(), existingSlice) + client := fake.NewClientset(newPod(), existingSlice) err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) g.Expect(err).NotTo(HaveOccurred()) @@ -145,7 +145,7 @@ func TestEnsureEndpointSlice(t *testing.T) { t.Run("When given an IPv6 address it should set AddressTypeIPv6", func(t *testing.T) { g := NewGomegaWithT(t) - client := fake.NewSimpleClientset(newPod()) + client := fake.NewClientset(newPod()) ipv6 := "fd00::1" err := ensureEndpointSlice(context.Background(), 
client, dnsName, hostname, namespace, ipv6) @@ -159,7 +159,7 @@ func TestEnsureEndpointSlice(t *testing.T) { t.Run("When the pod does not exist it should return an error", func(t *testing.T) { g := NewGomegaWithT(t) - client := fake.NewSimpleClientset() + client := fake.NewClientset() err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) g.Expect(err).To(HaveOccurred()) @@ -168,7 +168,7 @@ func TestEnsureEndpointSlice(t *testing.T) { t.Run("When given an invalid DNS name it should return an error", func(t *testing.T) { g := NewGomegaWithT(t) - client := fake.NewSimpleClientset(newPod()) + client := fake.NewClientset(newPod()) err := ensureEndpointSlice(context.Background(), client, "invalid", hostname, namespace, podIP) g.Expect(err).To(HaveOccurred()) @@ -185,7 +185,7 @@ func TestEnsureEndpointSlice(t *testing.T) { }, Spec: corev1.PodSpec{NodeName: nodeName}, } - client := fake.NewSimpleClientset(pod1) + client := fake.NewClientset(pod1) err := ensureEndpointSlice(context.Background(), client, "etcd-1.etcd-discovery.ocm-test-namespace.svc", "etcd-1", namespace, "10.128.64.187") g.Expect(err).NotTo(HaveOccurred()) @@ -203,7 +203,7 @@ func TestEnsureEndpointSlice(t *testing.T) { &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "etcd-1", Namespace: namespace, UID: "uid-1"}, Spec: corev1.PodSpec{NodeName: nodeName}}, &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "etcd-2", Namespace: namespace, UID: "uid-2"}, Spec: corev1.PodSpec{NodeName: nodeName}}, } - client := fake.NewSimpleClientset(pods...) + client := fake.NewClientset(pods...) 
for i := range 3 { h := metav1.ObjectMeta{Name: pods[i].(*corev1.Pod).Name}.Name From 6597d3d49e5d8c1418e669f33461cae647e2e141 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Fri, 29 May 2026 10:40:34 -0700 Subject: [PATCH 5/7] fix(etcd): address PR feedback for backwards-compatible SA handling This addresses review feedback on PR #8613 to maintain backwards compatibility during ServiceAccount changes. Blocking issues fixed: - Keep etcd-defrag-controller SA (HA-only) separate from etcd SA (always present) to avoid orphaned SAs on upgrade - Use fully qualified managed-by label "etcd-self-register.hypershift.openshift.io" to avoid future cross-contamination with LabelManagedBy filters Suggestions implemented: - Add Controller and BlockOwnerDeletion to OwnerReference for explicit GC intent - Replace context.Background() with t.Context() in tests to prevent goroutine leaks - Use gomega MatchError for cleaner error assertions - Simplify IP generation in tests with fmt.Sprintf The SA split works as follows: - HA mode: use etcd-defrag-controller SA with both defrag and self-register roles bound to it - Non-HA mode: use etcd SA with only self-register role This keeps the PR scoped to self-registration without upgrade risk. 
Signed-off-by: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Commit-Message-Assisted-by: Claude (via Claude Code) --- .../v2/assets/etcd/defrag-rolebinding.yaml | 2 +- .../v2/assets/etcd/defrag-serviceaccount.yaml | 4 ++ ...etcd-self-register-rolebinding-defrag.yaml | 11 ++++++ .../hostedcontrolplane/v2/etcd/component.go | 8 ++++ .../hostedcontrolplane/v2/etcd/statefulset.go | 8 +++- dnsresolver/cmd.go | 12 +++--- dnsresolver/cmd_test.go | 38 +++++++++---------- 7 files changed, 55 insertions(+), 28 deletions(-) create mode 100644 control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml create mode 100644 control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding-defrag.yaml diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml index 42921e5cd19..6f0b4b7437f 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-rolebinding.yaml @@ -8,4 +8,4 @@ roleRef: name: etcd-defrag-controller subjects: - kind: ServiceAccount - name: etcd + name: etcd-defrag-controller diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml new file mode 100644 index 00000000000..d82e8fe6606 --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/defrag-serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: etcd-defrag-controller diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding-defrag.yaml 
b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding-defrag.yaml new file mode 100644 index 00000000000..34b2781493a --- /dev/null +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/etcd-self-register-rolebinding-defrag.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: etcd-self-register-defrag +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: etcd-self-register +subjects: +- kind: ServiceAccount + name: etcd-defrag-controller diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go index 13a2b9676cc..6ae9bd12072 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/component.go @@ -50,6 +50,10 @@ func NewComponent() component.ControlPlaneComponent { "defrag-rolebinding.yaml", component.WithPredicate(defragControllerPredicate), ). + WithManifestAdapter( + "defrag-serviceaccount.yaml", + component.WithPredicate(defragControllerPredicate), + ). WithManifestAdapter( "etcd-serviceaccount.yaml", ). @@ -59,6 +63,10 @@ func NewComponent() component.ControlPlaneComponent { WithManifestAdapter( "etcd-self-register-rolebinding.yaml", ). + WithManifestAdapter( + "etcd-self-register-rolebinding-defrag.yaml", + component.WithPredicate(defragControllerPredicate), + ). 
Build() } diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go index b43b081e42b..20ba6305f4e 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/etcd/statefulset.go @@ -72,10 +72,14 @@ func adaptStatefulSet(cpContext component.WorkloadContext, sts *appsv1.StatefulS ) }) - sts.Spec.Template.Spec.ServiceAccountName = manifests.EtcdServiceAccount("").Name - + // Run as the etcd-defrag-controller SA when the defrag controller is enabled (HA-only), otherwise as the etcd SA. + // The etcd SA is always created and has RBAC for EndpointSlice self-registration. + // The etcd-defrag-controller SA is only created in HA mode; it has RBAC for defragmentation and is also bound to the etcd-self-register Role via the etcd-self-register-defrag RoleBinding. if defragControllerPredicate(cpContext) { + sts.Spec.Template.Spec.ServiceAccountName = manifests.EtcdDefragControllerServiceAccount("").Name sts.Spec.Template.Spec.Containers = append(sts.Spec.Template.Spec.Containers, buildEtcdDefragControllerContainer(hcp.Namespace)) + } else { + sts.Spec.Template.Spec.ServiceAccountName = manifests.EtcdServiceAccount("").Name } snapshotRestored := meta.IsStatusConditionTrue(hcp.Status.Conditions, string(hyperv1.EtcdSnapshotRestored)) diff --git a/dnsresolver/cmd.go b/dnsresolver/cmd.go index 54a57bf662a..2cd1a22e90e 100644 --- a/dnsresolver/cmd.go +++ b/dnsresolver/cmd.go @@ -120,14 +120,16 @@ func ensureEndpointSlice(ctx context.Context, client kubernetes.Interface, dnsNa Namespace: namespace, Labels: map[string]string{ discoveryv1.LabelServiceName: serviceName, - discoveryv1.LabelManagedBy: "control-plane-operator", + discoveryv1.LabelManagedBy: "etcd-self-register.hypershift.openshift.io", }, OwnerReferences: []metav1.OwnerReference{ { - APIVersion: "v1", - Kind: "Pod", - Name: pod.Name, - UID: pod.UID, + APIVersion: "v1", + Kind: "Pod", + Name: pod.Name, + UID: pod.UID, +
Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), }, }, }, diff --git a/dnsresolver/cmd_test.go b/dnsresolver/cmd_test.go index 81859429e8e..82fdbdb7678 100644 --- a/dnsresolver/cmd_test.go +++ b/dnsresolver/cmd_test.go @@ -1,7 +1,7 @@ package dnsresolver import ( - "context" + "fmt" "testing" . "github.com/onsi/gomega" @@ -89,14 +89,14 @@ func TestEnsureEndpointSlice(t *testing.T) { g := NewGomegaWithT(t) client := fake.NewClientset(newPod()) - err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) + err := ensureEndpointSlice(t.Context(), client, dnsName, hostname, namespace, podIP) g.Expect(err).NotTo(HaveOccurred()) - slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(t.Context(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) g.Expect(slice.Labels[discoveryv1.LabelServiceName]).To(Equal("etcd-discovery")) - g.Expect(slice.Labels[discoveryv1.LabelManagedBy]).To(Equal("control-plane-operator")) + g.Expect(slice.Labels[discoveryv1.LabelManagedBy]).To(Equal("etcd-self-register.hypershift.openshift.io")) g.Expect(slice.AddressType).To(Equal(discoveryv1.AddressTypeIPv4)) g.Expect(slice.Endpoints).To(HaveLen(1)) g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{podIP})) @@ -125,7 +125,7 @@ func TestEnsureEndpointSlice(t *testing.T) { Namespace: namespace, Labels: map[string]string{ discoveryv1.LabelServiceName: "etcd-discovery", - discoveryv1.LabelManagedBy: "control-plane-operator", + discoveryv1.LabelManagedBy: "etcd-self-register.hypershift.openshift.io", }, }, AddressType: discoveryv1.AddressTypeIPv4, @@ -135,10 +135,10 @@ func TestEnsureEndpointSlice(t *testing.T) { } client := fake.NewClientset(newPod(), existingSlice) - err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) 
+ err := ensureEndpointSlice(t.Context(), client, dnsName, hostname, namespace, podIP) g.Expect(err).NotTo(HaveOccurred()) - slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(t.Context(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{podIP})) }) @@ -148,10 +148,10 @@ func TestEnsureEndpointSlice(t *testing.T) { client := fake.NewClientset(newPod()) ipv6 := "fd00::1" - err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, ipv6) + err := ensureEndpointSlice(t.Context(), client, dnsName, hostname, namespace, ipv6) g.Expect(err).NotTo(HaveOccurred()) - slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(t.Context(), "etcd-discovery-self-etcd-0", metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) g.Expect(slice.AddressType).To(Equal(discoveryv1.AddressTypeIPv6)) g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{ipv6})) @@ -161,18 +161,16 @@ func TestEnsureEndpointSlice(t *testing.T) { g := NewGomegaWithT(t) client := fake.NewClientset() - err := ensureEndpointSlice(context.Background(), client, dnsName, hostname, namespace, podIP) - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("failed to get pod")) + err := ensureEndpointSlice(t.Context(), client, dnsName, hostname, namespace, podIP) + g.Expect(err).To(MatchError(ContainSubstring("failed to get pod"))) }) t.Run("When given an invalid DNS name it should return an error", func(t *testing.T) { g := NewGomegaWithT(t) client := fake.NewClientset(newPod()) - err := ensureEndpointSlice(context.Background(), client, "invalid", hostname, namespace, 
podIP) - g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("failed to parse service name")) + err := ensureEndpointSlice(t.Context(), client, "invalid", hostname, namespace, podIP) + g.Expect(err).To(MatchError(ContainSubstring("failed to parse service name"))) }) t.Run("When called for etcd-1 it should use the correct slice name and hostname", func(t *testing.T) { @@ -187,10 +185,10 @@ func TestEnsureEndpointSlice(t *testing.T) { } client := fake.NewClientset(pod1) - err := ensureEndpointSlice(context.Background(), client, "etcd-1.etcd-discovery.ocm-test-namespace.svc", "etcd-1", namespace, "10.128.64.187") + err := ensureEndpointSlice(t.Context(), client, "etcd-1.etcd-discovery.ocm-test-namespace.svc", "etcd-1", namespace, "10.128.64.187") g.Expect(err).NotTo(HaveOccurred()) - slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(context.Background(), "etcd-discovery-self-etcd-1", metav1.GetOptions{}) + slice, err := client.DiscoveryV1().EndpointSlices(namespace).Get(t.Context(), "etcd-discovery-self-etcd-1", metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) g.Expect(*slice.Endpoints[0].Hostname).To(Equal("etcd-1")) g.Expect(slice.Endpoints[0].Addresses).To(Equal([]string{"10.128.64.187"})) @@ -208,12 +206,12 @@ func TestEnsureEndpointSlice(t *testing.T) { for i := range 3 { h := metav1.ObjectMeta{Name: pods[i].(*corev1.Pod).Name}.Name dns := h + ".etcd-discovery." + namespace + ".svc" - ip := "10.128.64." 
+ string(rune('1'+i)) - err := ensureEndpointSlice(context.Background(), client, dns, h, namespace, ip) + ip := fmt.Sprintf("10.128.64.%d", 186+i) + err := ensureEndpointSlice(t.Context(), client, dns, h, namespace, ip) g.Expect(err).NotTo(HaveOccurred()) } - slices, err := client.DiscoveryV1().EndpointSlices(namespace).List(context.Background(), metav1.ListOptions{}) + slices, err := client.DiscoveryV1().EndpointSlices(namespace).List(t.Context(), metav1.ListOptions{}) g.Expect(err).NotTo(HaveOccurred()) g.Expect(slices.Items).To(HaveLen(3)) }) From c8b0fa11ed09637d8d49c92276f08062aed28819 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:02:55 -0700 Subject: [PATCH 6/7] docs(etcd): add comments explaining --self-register flag Address feedback from @enxebre to document why --self-register is needed and improve IP address parsing. Changes: - Added comment in statefulset.yaml explaining --self-register flag purpose (bypasses stale EndpointSlice cache under high HCP density) - Added comment in dnsresolver/cmd.go documenting the flag - Replaced manual net.ParseIP().To4() check with netutil.IsIPv4Address() which properly handles ParseIP returning nil and returns an error instead of silently failing Signed-off-by: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Commit-Message-Assisted-by: Claude (via Claude Code) --- .../v2/assets/etcd/statefulset.yaml | 3 +++ dnsresolver/cmd.go | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml index 9ede7c53a14..ffd0e2124ea 100644 --- a/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml +++ b/control-plane-operator/controllers/hostedcontrolplane/v2/assets/etcd/statefulset.yaml @@ -187,6 +187,9 @@ spec: - mountPath: 
/etc/etcd/tls/etcd-ca name: etcd-ca initContainers: + # Self-register this pod's IP into an EndpointSlice before waiting for DNS resolution. + # This bypasses stale kube-controller-manager EndpointSlice cache issues under high HCP density + # (2,500+ namespaces), where the standard controller can delay DNS updates by minutes to hours. - args: - -c - exec control-plane-operator resolve-dns --self-register ${HOSTNAME}.etcd-discovery.${NAMESPACE}.svc diff --git a/dnsresolver/cmd.go b/dnsresolver/cmd.go index 2cd1a22e90e..814d4efa759 100644 --- a/dnsresolver/cmd.go +++ b/dnsresolver/cmd.go @@ -8,6 +8,8 @@ import ( "strings" "time" + "github.com/openshift/hypershift/support/netutil" + corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -43,6 +45,9 @@ func NewCommand() *cobra.Command { }, } + // --self-register creates an EndpointSlice for this pod before waiting for DNS resolution. + // This bypasses stale kube-controller-manager EndpointSlice cache issues under high HCP density, + // where the standard controller can delay DNS updates by minutes to hours. 
cmd.Flags().BoolVar(&selfRegister, "self-register", false, "Register this pod's IP into an EndpointSlice before resolving DNS") return cmd } @@ -108,9 +113,14 @@ func ensureEndpointSlice(ctx context.Context, client kubernetes.Interface, dnsNa return fmt.Errorf("failed to get pod %s/%s: %w", namespace, hostname, err) } - addressType := discoveryv1.AddressTypeIPv4 - if net.ParseIP(podIP) != nil && net.ParseIP(podIP).To4() == nil { - addressType = discoveryv1.AddressTypeIPv6 + // Determine address type based on pod IP + isIPv4, err := netutil.IsIPv4Address(podIP) + if err != nil { + return fmt.Errorf("failed to parse pod IP %s: %w", podIP, err) + } + addressType := discoveryv1.AddressTypeIPv6 + if isIPv4 { + addressType = discoveryv1.AddressTypeIPv4 } sliceName := fmt.Sprintf("%s-self-%s", serviceName, hostname) From 824480d5932474d17b025ca220f02af7c7c0be3a Mon Sep 17 00:00:00 2001 From: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Date: Wed, 3 Jun 2026 07:16:25 -0700 Subject: [PATCH 7/7] test(etcd): add test case for malformed IP address handling Add test to verify that ensureEndpointSlice properly returns an error when given a malformed IP address (not IPv4 or IPv6). This ensures the netutil.IsIPv4Address() error handling works correctly and doesn't silently fail on invalid input. 
Signed-off-by: Cliff Schomburg <7424213+cssjr@users.noreply.github.com> Commit-Message-Assisted-by: Claude (via Claude Code) --- dnsresolver/cmd_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dnsresolver/cmd_test.go b/dnsresolver/cmd_test.go index 82fdbdb7678..fa14e80f8fc 100644 --- a/dnsresolver/cmd_test.go +++ b/dnsresolver/cmd_test.go @@ -173,6 +173,15 @@ func TestEnsureEndpointSlice(t *testing.T) { g.Expect(err).To(MatchError(ContainSubstring("failed to parse service name"))) }) + t.Run("When given a malformed IP address it should return an error", func(t *testing.T) { + g := NewGomegaWithT(t) + client := fake.NewClientset(newPod()) + invalidIP := "not-an-ip-address" + + err := ensureEndpointSlice(t.Context(), client, dnsName, hostname, namespace, invalidIP) + g.Expect(err).To(MatchError(ContainSubstring("failed to parse pod IP"))) + }) + t.Run("When called for etcd-1 it should use the correct slice name and hostname", func(t *testing.T) { g := NewGomegaWithT(t) pod1 := &corev1.Pod{