-
Notifications
You must be signed in to change notification settings - Fork 495
OCPBUGS-84269: fix(nodepool): skip CNI-internal IPs in ClusterNetworkCIDRConflict check #8230
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -599,7 +599,7 @@ func (r *NodePoolReconciler) setMachineAndNodeConditions(ctx context.Context, no | |
|
|
||
| r.setAllNodesHealthyCondition(nodePool, machines) | ||
|
|
||
| err = r.setCIDRConflictCondition(nodePool, machines, hc) | ||
| err = r.setCIDRConflictCondition(ctx, nodePool, machines, hc) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -747,33 +747,82 @@ func (r *NodePoolReconciler) setAllMachinesReadyCondition(nodePool *hyperv1.Node | |
| SetStatusCondition(&nodePool.Status.Conditions, *allMachinesReadyCondition) | ||
| } | ||
|
|
||
| func (r *NodePoolReconciler) setCIDRConflictCondition(nodePool *hyperv1.NodePool, machines []*capiv1.Machine, hc *hyperv1.HostedCluster) error { | ||
| type cidrConflictEntry struct { | ||
| ip string | ||
| cidr string | ||
| } | ||
|
|
||
| func (r *NodePoolReconciler) setCIDRConflictCondition(ctx context.Context, nodePool *hyperv1.NodePool, machines []*capiv1.Machine, hc *hyperv1.HostedCluster) error { | ||
| log := ctrl.LoggerFrom(ctx) | ||
| maxMessageLength := 256 | ||
|
|
||
| if len(machines) < 1 || len(hc.Spec.Networking.ClusterNetwork) < 1 { | ||
| removeStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolClusterNetworkCIDRConflictType) | ||
| return nil | ||
| } | ||
|
|
||
| clusterNetworkStr := hc.Spec.Networking.ClusterNetwork[0].CIDR.String() | ||
| clusterNetwork, err := netip.ParsePrefix(clusterNetworkStr) | ||
| if err != nil { | ||
| return err | ||
| var clusterNetworks []netip.Prefix | ||
| for _, cn := range hc.Spec.Networking.ClusterNetwork { | ||
| prefix, err := netip.ParsePrefix(cn.CIDR.String()) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| clusterNetworks = append(clusterNetworks, prefix) | ||
| } | ||
|
|
||
| messages := []string{} | ||
| for _, machine := range machines { | ||
| seen := make(map[string]struct{}) | ||
| var conflicting []cidrConflictEntry | ||
| hasAddressOutsideClusterNetwork := false | ||
|
|
||
| for _, addr := range machine.Status.Addresses { | ||
| if addr.Type != capiv1.MachineExternalIP && addr.Type != capiv1.MachineInternalIP { | ||
| continue | ||
| } | ||
|
|
||
| ipaddr, err := netip.ParseAddr(addr.Address) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if clusterNetwork.Contains(ipaddr) { | ||
| messages = append(messages, fmt.Sprintf("machine [%s] with ip [%s] collides with cluster-network cidr [%s]", machine.Name, addr.Address, clusterNetworkStr)) | ||
| key := ipaddr.String() | ||
| if _, ok := seen[key]; ok { | ||
| continue | ||
| } | ||
| seen[key] = struct{}{} | ||
|
|
||
| if ipaddr.IsLinkLocalUnicast() || ipaddr.IsLinkLocalMulticast() { | ||
| continue | ||
| } | ||
|
|
||
| matchedCIDR := "" | ||
| for _, cn := range clusterNetworks { | ||
| if cn.Contains(ipaddr) { | ||
| matchedCIDR = cn.String() | ||
| break | ||
| } | ||
| } | ||
| if matchedCIDR != "" { | ||
| conflicting = append(conflicting, cidrConflictEntry{ip: addr.Address, cidr: matchedCIDR}) | ||
| } else { | ||
| hasAddressOutsideClusterNetwork = true | ||
| } | ||
| } | ||
|
|
||
| // When a machine has addresses both inside and outside the cluster network, | ||
| // the in-network addresses are typically CNI-internal (e.g. OVN-Kubernetes | ||
| // management port IPs on KubeVirt VMs) and do not represent a real conflict. | ||
| // Only report a conflict when ALL of a machine's addresses fall within the | ||
| // cluster network, which indicates the machine's infrastructure IP genuinely | ||
| // overlaps with the pod CIDR. | ||
| if !hasAddressOutsideClusterNetwork { | ||
| for _, entry := range conflicting { | ||
| messages = append(messages, fmt.Sprintf("machine [%s] with ip [%s] collides with cluster-network cidr [%s]", machine.Name, entry.ip, entry.cidr)) | ||
| } | ||
| } else if len(conflicting) > 0 { | ||
| log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check", | ||
| "machine", machine.Name, | ||
| "suppressedCount", len(conflicting)) | ||
| } | ||
| } | ||
|
Comment on lines
+818
to
827
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [suggestion] Consider emitting a structured log line at debug level when addresses are suppressed by the heuristic. Operators debugging networking issues on KubeVirt clusters might want to see that addresses were evaluated but deemed CNI-internal: if hasAddressOutsideClusterNetwork && len(conflicting) > 0 {
log.Log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check",
"machine", machine.Name,
"suppressedCount", len(conflicting))
}This is optional but would help day-2 troubleshooting.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Added a } else if len(conflicting) > 0 {
log.V(4).Info("Skipping CNI-internal addresses for CIDR conflict check",
"machine", machine.Name,
"suppressedCount", len(conflicting))
}Also added |
||
|
|
||
|
|
@@ -798,6 +847,8 @@ func (r *NodePoolReconciler) setCIDRConflictCondition(nodePool *hyperv1.NodePool | |
| ObservedGeneration: nodePool.Generation, | ||
| } | ||
| SetStatusCondition(&nodePool.Status.Conditions, *cidrConflictCondition) | ||
| } else { | ||
| removeStatusCondition(&nodePool.Status.Conditions, hyperv1.NodePoolClusterNetworkCIDRConflictType) | ||
|
Comment on lines
+850
to
+851
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [blocking] This Please add a test case that:
Example name: Without this test, a regression that breaks condition clearing would go undetected.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Added test case: It pre-populates the NodePool with an existing |
||
| } | ||
|
|
||
| return nil | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[question] What happens when a machine has zero qualifying addresses — i.e., all addresses are link-local? In that case
hasAddressOutsideClusterNetworkstaysfalseandconflictingis empty, so the!hasAddressOutsideClusterNetworkbranch enters but appends nothing. The behavior is correct (no conflict message), but is a machine with only link-local addresses a valid state? If not, should we flag it differently (e.g., a warning)? If it is valid, please add a test case to document that expectation (see my test comment below).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When a machine has only link-local addresses, both
hasAddressOutsideClusterNetworkandconflictingremain empty, so no conflict is reported and no condition is set. This is correct because link-local addresses are not routable infrastructure IPs and should not influence the heuristic in either direction.Added a test documenting this expectation:
"When machine has only link-local addresses it should not report cidr collision"