Skip to content

Commit 5b4e2f1

Browse files
committed
attempting to fix race conditions part 2
1 parent: 8224fde · commit: 5b4e2f1

1 file changed

Lines changed: 20 additions & 8 deletions

File tree

internal/cloning/cloning_service.go

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -270,7 +270,23 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error {
270270
// Release the vmid allocation mutex now that all of the VMs are cloned on proxmox
271271
cs.vmidMutex.Unlock()
272272

273-
// 9. Configure VNet of all VMs
273+
// 9. Wait for all router disks to be fully available before configuring VNets.
274+
// Proxmox clone is two-phase: the clone lock (Phase 1) releases before the storage
275+
// backend finishes writing the disk (Phase 2). If SetPodVnet runs before Phase 2
276+
// completes, Proxmox's disk finalization can overwrite the net1 config change,
277+
// leaving the router connected to the wrong vnet.
278+
log.Printf("Waiting for router disks to be available before configuring VNets")
279+
routerDiskReady := make(map[int]bool)
280+
for _, routerInfo := range clonedRouters {
281+
log.Printf("Waiting for router disk to be available for %s (VMID: %d)", routerInfo.TargetName, routerInfo.VMID)
282+
if err := cs.ProxmoxService.WaitForDisk(routerInfo.Node, routerInfo.VMID, cs.Config.RouterWaitTimeout); err != nil {
283+
errors = append(errors, fmt.Sprintf("router disk unavailable for %s: %v", routerInfo.TargetName, err))
284+
} else {
285+
routerDiskReady[routerInfo.VMID] = true
286+
}
287+
}
288+
289+
// 10. Configure VNet of all VMs
274290
log.Printf("Configuring VNets for %d targets", len(req.Targets))
275291
for _, target := range req.Targets {
276292
vnetName := fmt.Sprintf("kamino%d", target.PodNumber)
@@ -281,7 +297,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error {
281297
}
282298
}
283299

284-
// 10. Start all routers and wait for them to be running
300+
// 11. Start all routers and wait for them to be running
285301
req.SSE.Send(
286302
ProgressMessage{
287303
Message: "Starting routers",
@@ -290,11 +306,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error {
290306
)
291307
log.Printf("Starting %d routers", len(clonedRouters))
292308
for _, routerInfo := range clonedRouters {
293-
// Wait for router disk to be available
294-
log.Printf("Waiting for router disk to be available for %s (VMID: %d)", routerInfo.TargetName, routerInfo.VMID)
295-
err = cs.ProxmoxService.WaitForDisk(routerInfo.Node, routerInfo.VMID, cs.Config.RouterWaitTimeout)
296-
if err != nil {
297-
errors = append(errors, fmt.Sprintf("router disk unavailable for %s: %v", routerInfo.TargetName, err))
309+
if !routerDiskReady[routerInfo.VMID] {
298310
continue
299311
}
300312

@@ -314,7 +326,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error {
314326
}
315327
}
316328

317-
// 11. Configure all pod routers (separate step after all routers are running)
329+
// 12. Configure all pod routers (separate step after all routers are running)
318330
req.SSE.Send(
319331
ProgressMessage{
320332
Message: "Configuring pod routers",

0 commit comments

Comments
 (0)