Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 66 additions & 5 deletions pkg/types/vsphere/validation/platform.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package validation
import (
"fmt"
"net"
"path/filepath"
"path"
"regexp"
"strings"

Expand Down Expand Up @@ -163,6 +163,11 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
var associatedVCenter *vsphere.VCenter

zoneNames := make(map[string]string)
type topologyEntry struct {
name string
networks []string
}
fdTopologies := make(map[vsphereTopologyKey][]topologyEntry)

for index, failureDomain := range p.FailureDomains {
if regionName, ok := zoneNames[failureDomain.Zone]; !ok {
Expand All @@ -171,6 +176,20 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
allErrs = append(allErrs, field.Invalid(fldPath.Child("zone"), failureDomain.Zone, fmt.Sprintf("cannot be used more than once for the failure domain region %q", failureDomain.Region)))
}

topoKey := normalizedTopologyKey(failureDomain)
duplicate := false
for _, entry := range fdTopologies[topoKey] {
if slices.Equal(failureDomain.Topology.Networks, entry.networks) {
allErrs = append(allErrs, field.Invalid(fldPath.Index(index), failureDomain.Name,
fmt.Sprintf("failure domain %q has identical topology (same server, datacenter, computeCluster, datastore, networks, resourcePool, hostGroup) as %q; this provides no additional fault tolerance", failureDomain.Name, entry.name)))
duplicate = true
break
}
}
if !duplicate {
fdTopologies[topoKey] = append(fdTopologies[topoKey], topologyEntry{name: failureDomain.Name, networks: failureDomain.Topology.Networks})
}

if failureDomain.ZoneType == "" && failureDomain.RegionType == "" {
logrus.Debug("using the defaults regionType is Datacenter and zoneType is ComputeCluster")
}
Expand Down Expand Up @@ -249,7 +268,7 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
if !strings.Contains(failureDomain.Topology.Datastore, failureDomain.Topology.Datacenter) {
return append(allErrs, field.Invalid(topologyFld.Child("datastore"), failureDomain.Topology.Datastore, "the datastore defined does not exist in the correct datacenter"))
}
p.FailureDomains[index].Topology.Datastore = filepath.Clean(p.FailureDomains[index].Topology.Datastore)
p.FailureDomains[index].Topology.Datastore = path.Clean(p.FailureDomains[index].Topology.Datastore)
}

if len(failureDomain.Topology.TagIDs) > 10 {
Expand Down Expand Up @@ -304,7 +323,7 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
if len(failureDomain.Topology.Datacenter) != 0 && !strings.Contains(failureDomain.Topology.Datacenter, datacenterName) {
return append(allErrs, field.Invalid(topologyFld.Child("computeCluster"), computeCluster, fmt.Sprintf("compute cluster must be in datacenter %s", failureDomain.Topology.Datacenter)))
}
p.FailureDomains[index].Topology.ComputeCluster = filepath.Clean(p.FailureDomains[index].Topology.ComputeCluster)
p.FailureDomains[index].Topology.ComputeCluster = path.Clean(p.FailureDomains[index].Topology.ComputeCluster)
}

if len(failureDomain.Topology.ResourcePool) != 0 {
Expand All @@ -323,7 +342,7 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
return append(allErrs, field.Invalid(topologyFld.Child("resourcePool"), resourcePool, fmt.Sprintf("resource pool must be in compute cluster %s", failureDomain.Topology.ComputeCluster)))
}

p.FailureDomains[index].Topology.ResourcePool = filepath.Clean(p.FailureDomains[index].Topology.ResourcePool)
p.FailureDomains[index].Topology.ResourcePool = path.Clean(p.FailureDomains[index].Topology.ResourcePool)
}

// Validate that template and clusterOSImage are mutually exclusive
Expand All @@ -333,13 +352,55 @@ func validateFailureDomains(p *vsphere.Platform, platformFldPath *field.Path, fl
}

if len(failureDomain.Topology.Template) > 0 {
p.FailureDomains[index].Topology.Template = filepath.Clean(p.FailureDomains[index].Topology.Template)
p.FailureDomains[index].Topology.Template = path.Clean(p.FailureDomains[index].Topology.Template)
}
}

return allErrs
}

// vsphereTopologyKey is a comparable struct representing the scalar physical
// infrastructure fields of a failure domain. Used as a map key for first-pass
// dedup. Networks are compared separately (order-sensitive slice comparison)
// because Go map keys cannot contain slices.
//
// Included fields (define physical placement):
// - server, datacenter, computeCluster, datastore, resourcePool, hostGroup
//
// Excluded fields (identity/metadata, not infrastructure):
// - Name, Region, Zone, RegionType, ZoneType (scheduling labels)
// - Folder, Template, TagIDs (VM metadata, not fault-zone identity)
// - Networks (compared separately via slices.Equal, order matters)
type vsphereTopologyKey struct {
server string
datacenter string
computeCluster string
datastore string
resourcePool string
hostGroup string
}

// normalizedTopologyKey builds a comparable struct key from a failure domain.
// Inventory paths are canonicalized with path.Clean (vSphere paths are URL-style,
// always use forward slashes).
func normalizedTopologyKey(fd vsphere.FailureDomain) vsphereTopologyKey {
normalizePath := func(p string) string {
if p == "" {
return ""
}
return path.Clean(p)
}

return vsphereTopologyKey{
server: fd.Server,
datacenter: fd.Topology.Datacenter,
computeCluster: normalizePath(fd.Topology.ComputeCluster),
datastore: normalizePath(fd.Topology.Datastore),
resourcePool: normalizePath(fd.Topology.ResourcePool),
hostGroup: fd.Topology.HostGroup,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you won't be able to use hostGroup when the topology is for vm-host zonal.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jcpowermac Could you clarify your comment? I want to make sure I understand correctly.

Currently, hostGroup is unconditionally included in the topology key. Are you suggesting:

It should only be included when ZoneType == HostGroup (for vm-host zonal), OR
There's a different issue with the current implementation?
My understanding from the enhancement doc is that vm-host zonal does use hostGroup to distinguish zones within a stretched cluster.

}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

// validateDiskType checks that the specified diskType is valid.
func validateDiskType(p *vsphere.Platform, fldPath *field.Path) field.ErrorList {
allErrs := field.ErrorList{}
Expand Down
141 changes: 140 additions & 1 deletion pkg/types/vsphere/validation/platform_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ func TestValidatePlatform(t *testing.T) {

for i := range p.FailureDomains {
p.FailureDomains[i].Topology.ComputeCluster = "/test-datacenter/host/HostClusterFolder/test-cluster"
p.FailureDomains[i].Topology.ResourcePool = "/test-datacenter/host/HostClusterFolder/test-cluster/Resources/test-resourcepool"
p.FailureDomains[i].Topology.ResourcePool = fmt.Sprintf("/test-datacenter/host/HostClusterFolder/test-cluster/Resources/test-resourcepool-%d", i)
p.FailureDomains[i].Topology.Datastore = "/test-datacenter/datastore/StorageFolder/test-datastore"
p.FailureDomains[i].Topology.Folder = "/test-datacenter/vm/VMFolder/test-folder"
}
Expand Down Expand Up @@ -552,6 +552,145 @@ func TestValidatePlatform(t *testing.T) {
}(),
expectedError: `test-path\.failureDomains\.name\[1\]: Duplicate value: "test-east-1a"`,
},
{
name: "Multi-zone platform failureDomain duplicate topology",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Server = p.FailureDomains[0].Server
return p
}(),
expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},
Comment thread
coderabbitai[bot] marked this conversation as resolved.
{
name: "Valid HostGroup failure domains with same topology but different HostGroups",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].RegionType = vsphere.ComputeClusterFailureDomain
p.FailureDomains[0].ZoneType = vsphere.HostGroupFailureDomain
p.FailureDomains[0].Topology.HostGroup = "host-group-a"
p.FailureDomains[0].Topology.ResourcePool = "/test-datacenter/host/test-cluster/Resources/test-resourcepool"

p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].RegionType = vsphere.ComputeClusterFailureDomain
p.FailureDomains[1].ZoneType = vsphere.HostGroupFailureDomain
p.FailureDomains[1].Topology.HostGroup = "host-group-b"
return p
}(),
},
{
name: "Duplicate topology with same networks same order (multi-net)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].Topology.Networks = []string{"net-a", "net-b"}
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Server = p.FailureDomains[0].Server
return p
}(),
expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},
{
name: "Valid: same networks different order (multi-net, order matters)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].Topology.Networks = []string{"net-a", "net-b"}
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.Networks = []string{"net-b", "net-a"}
return p
}(),
},
{
name: "Valid: different networks (multi-net)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].Topology.Networks = []string{"net-a", "net-b"}
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.Networks = []string{"net-a", "net-c"}
return p
}(),
},
{
name: "Valid: subset networks (different length)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].Topology.Networks = []string{"net-a"}
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.Networks = []string{"net-a", "net-b"}
return p
}(),
},
{
name: "Duplicate topology with path normalization (trailing slash)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.ComputeCluster = "/test-datacenter/host/test-cluster/"
return p
}(),
expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},
{
name: "Duplicate topology with path normalization (double slash)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.Datastore = "/test-datacenter/datastore//test-datastore"
return p
}(),
expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},
{
name: "Valid: different datastore (not duplicate)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.Datastore = "/test-datacenter/datastore/other-datastore"
return p
}(),
},
{
name: "Valid: different server (not duplicate)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Server = "other-vcenter"
return p
}(),
},
{
name: "Duplicate topology HostGroup zonal same hostGroup",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[0].RegionType = vsphere.ComputeClusterFailureDomain
p.FailureDomains[0].ZoneType = vsphere.HostGroupFailureDomain
p.FailureDomains[0].Topology.HostGroup = "host-group-a"
p.FailureDomains[0].Topology.ResourcePool = "/test-datacenter/host/test-cluster/Resources/test-resourcepool"

p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].RegionType = vsphere.ComputeClusterFailureDomain
p.FailureDomains[1].ZoneType = vsphere.HostGroupFailureDomain
return p
}(),
expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},
{
name: "Valid: different resourcePool (not duplicate)",
platform: func() *vsphere.Platform {
p := validPlatform()
p.FailureDomains[1].Server = p.FailureDomains[0].Server
p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
p.FailureDomains[1].Topology.ResourcePool = "/test-datacenter/host/test-cluster/Resources/other-pool"
return p
}(),
},
Comment on lines +582 to +693
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

Add test coverage for HostGroup evasion on non-HostGroup zones.

The test suite comprehensively covers network ordering, path normalization, and HostGroup-zone scenarios. However, there's no test case for two non-HostGroup failure domains (e.g., ZoneType is empty/default or ComputeCluster) with identical topology but different Topology.HostGroup values. This scenario should be detected as a duplicate (since HostGroup is not topology-defining for non-HostGroup zones), but the current implementation at line 400 of platform.go would treat them as distinct.

🧪 Proposed test case (should expect error after fixing platform.go line 400)
{
	name: "Duplicate topology: non-HostGroup zones with different HostGroup values (evasion attempt)",
	platform: func() *vsphere.Platform {
		p := validPlatform()
		// Both are default ComputeCluster zones (not HostGroup zones)
		p.FailureDomains[0].Topology.HostGroup = "should-be-ignored-a"
		p.FailureDomains[1].Server = p.FailureDomains[0].Server
		p.FailureDomains[1].Topology = p.FailureDomains[0].Topology
		p.FailureDomains[1].Topology.HostGroup = "should-be-ignored-b"
		return p
	}(),
	// Should produce an error because HostGroup should not participate in dedup for non-HostGroup zones
	expectedError: `test-path\.failureDomains\[1\]: Invalid value: "test-east-2a": failure domain "test-east-2a" has identical topology .* as "test-east-1a"; this provides no additional fault tolerance`,
},

Note: This test will fail with the current implementation until the fix suggested in platform.go (conditional HostGroup inclusion) is applied.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@pkg/types/vsphere/validation/platform_test.go` around lines 582 - 693, The
duplicate-topology check currently includes Topology.HostGroup unconditionally,
allowing an evasion when two non-HostGroup failure domains differ only by
HostGroup; update the topology comparison in the duplicate-detection logic (the
function in platform.go that normalizes/compares FailureDomain.Topology used to
validate failure domains) so that Topology.HostGroup is only considered when
both FailureDomain.ZoneType == vsphere.HostGroupFailureDomain (otherwise ignore
HostGroup for comparison), and ensure this change respects ZoneType values like
vsphere.ComputeClusterFailureDomain and vsphere.HostGroupFailureDomain when
building the topology key for duplicate detection.

{
name: "Multi-zone platform failureDomain zone missing tag category",
platform: func() *vsphere.Platform {
Expand Down