Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 59 additions & 8 deletions hypershift-operator/controllers/nodepool/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ func (r *NodePoolReconciler) setMachineAndNodeConditions(ctx context.Context, no

r.setAllNodesHealthyCondition(nodePool, machines)

err = r.setCIDRConflictCondition(nodePool, machines, hc)
err = r.setCIDRConflictCondition(ctx, nodePool, machines, hc)
if err != nil {
return err
}
Expand Down Expand Up @@ -747,33 +747,82 @@ func (r *NodePoolReconciler) setAllMachinesReadyCondition(nodePool *hyperv1.Node
SetStatusCondition(&nodePool.Status.Conditions, *allMachinesReadyCondition)
}

func (r *NodePoolReconciler) setCIDRConflictCondition(nodePool *hyperv1.NodePool, machines []*capiv1.Machine, hc *hyperv1.HostedCluster) error {
type cidrConflictEntry struct {
ip string
cidr string
}

func (r *NodePoolReconciler) setCIDRConflictCondition(ctx context.Context, nodePool *hyperv1.NodePool, machines []*capiv1.Machine, hc *hyperv1.HostedCluster) error {
log := ctrl.LoggerFrom(ctx)
maxMessageLength := 256

if len(machines) < 1 || len(hc.Spec.Networking.ClusterNetwork) < 1 {
removeStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolClusterNetworkCIDRConflictType)
return nil
}

clusterNetworkStr := hc.Spec.Networking.ClusterNetwork[0].CIDR.String()
clusterNetwork, err := netip.ParsePrefix(clusterNetworkStr)
if err != nil {
return err
var clusterNetworks []netip.Prefix
for _, cn := range hc.Spec.Networking.ClusterNetwork {
prefix, err := netip.ParsePrefix(cn.CIDR.String())
if err != nil {
return err
}
clusterNetworks = append(clusterNetworks, prefix)
}

messages := []string{}
for _, machine := range machines {
seen := make(map[string]struct{})
var conflicting []cidrConflictEntry
hasAddressOutsideClusterNetwork := false

for _, addr := range machine.Status.Addresses {
if addr.Type != capiv1.MachineExternalIP && addr.Type != capiv1.MachineInternalIP {
continue
}

ipaddr, err := netip.ParseAddr(addr.Address)
if err != nil {
return err
}
if clusterNetwork.Contains(ipaddr) {
messages = append(messages, fmt.Sprintf("machine [%s] with ip [%s] collides with cluster-network cidr [%s]", machine.Name, addr.Address, clusterNetworkStr))
key := ipaddr.String()
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}

if ipaddr.IsLinkLocalUnicast() || ipaddr.IsLinkLocalMulticast() {
continue
}
Comment on lines +794 to +796

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[question] What happens when a machine has zero qualifying addresses — i.e., all addresses are link-local? In that case hasAddressOutsideClusterNetwork stays false and conflicting is empty, so the !hasAddressOutsideClusterNetwork branch enters but appends nothing. The behavior is correct (no conflict message), but is a machine with only link-local addresses a valid state? If not, should we flag it differently (e.g., a warning)? If it is valid, please add a test case to document that expectation (see my test comment below).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a machine has only link-local addresses, both hasAddressOutsideClusterNetwork and conflicting remain empty, so no conflict is reported and no condition is set. This is correct because link-local addresses are not routable infrastructure IPs and should not influence the heuristic in either direction.

Added a test documenting this expectation: "When machine has only link-local addresses it should not report cidr collision"


matchedCIDR := ""
for _, cn := range clusterNetworks {
if cn.Contains(ipaddr) {
matchedCIDR = cn.String()
break
}
}
if matchedCIDR != "" {
conflicting = append(conflicting, cidrConflictEntry{ip: addr.Address, cidr: matchedCIDR})
} else {
hasAddressOutsideClusterNetwork = true
}
}

// When a machine has addresses both inside and outside the cluster network,
// the in-network addresses are typically CNI-internal (e.g. OVN-Kubernetes
// management port IPs on KubeVirt VMs) and do not represent a real conflict.
// Only report a conflict when ALL of a machine's addresses fall within the
// cluster network, which indicates the machine's infrastructure IP genuinely
// overlaps with the pod CIDR.
if !hasAddressOutsideClusterNetwork {
for _, entry := range conflicting {
messages = append(messages, fmt.Sprintf("machine [%s] with ip [%s] collides with cluster-network cidr [%s]", machine.Name, entry.ip, entry.cidr))
}
} else if len(conflicting) > 0 {
log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check",
"machine", machine.Name,
"suppressedCount", len(conflicting))
}
}
Comment on lines +818 to 827

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[suggestion] Consider emitting a structured log line at debug level when addresses are suppressed by the heuristic. Operators debugging networking issues on KubeVirt clusters might want to see that addresses were evaluated but deemed CNI-internal:

if hasAddressOutsideClusterNetwork && len(conflicting) > 0 {
	log.Log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check",
		"machine", machine.Name,
		"suppressedCount", len(conflicting))
}

This is optional but would help day-2 troubleshooting.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Added a log.V(4).Info call when addresses are suppressed:

} else if len(conflicting) > 0 {
    log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check",
        "machine", machine.Name,
        "suppressedCount", len(conflicting))
}

Also added ctx to the function signature to get a structured logger via ctrl.LoggerFrom(ctx).


Expand All @@ -798,6 +847,8 @@ func (r *NodePoolReconciler) setCIDRConflictCondition(nodePool *hyperv1.NodePool
ObservedGeneration: nodePool.Generation,
}
SetStatusCondition(&nodePool.Status.Conditions, *cidrConflictCondition)
} else {
removeStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolClusterNetworkCIDRConflictType)
Comment on lines +850 to +851

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[blocking] This else branch with removeStatusCondition is new behavior — previously the condition was never cleared once set. This is a great operational improvement, but no test verifies it.

Please add a test case that:

  1. Pre-populates a NodePool with an existing ClusterNetworkCIDRConflictType condition (Status=True)
  2. Calls setCIDRConflictCondition with machines that have addresses outside the cluster network (so no conflict should be reported)
  3. Asserts the condition is removed from nodePool.Status.Conditions

Example name: "When previously-conflicting machine gains out-of-cluster address it should clear the cidr conflict condition"

Without this test, a regression that breaks condition clearing would go undetected.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Added test case: "When previously-conflicting machine gains out-of-cluster address it should clear the cidr conflict condition"

It pre-populates the NodePool with an existing ClusterNetworkCIDRConflictType=True condition, then calls setCIDRConflictCondition with a machine that has an outside address, and asserts the condition is removed.

}

return nil
Expand Down
Loading