From e321d8c8730080b01e1fa9271cd92b4f03b1641a Mon Sep 17 00:00:00 2001
From: Rafael Westphal
Date: Fri, 26 Jun 2026 15:41:31 +0000
Subject: [PATCH] fix(tests): retry VM start on stockout in TestRestartVM

During nightly E2E integration tests, TestRestartVM frequently flakes
when restarting the VM if the zone is out of capacity (stockout),
because the underlying gce-testing-internal library treats stockouts as
permanent errors and aborts retries.

This CL implements a local helper restartInstanceWithRetries in the test
suite that performs StopInstance followed by StartInstance inside a
custom retry loop, specifically retrying on ZONE_RESOURCE_POOL_EXHAUSTED
and other transient capacity errors.
---
 integration_test/ops_agent_test/main_test.go | 54 +++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/integration_test/ops_agent_test/main_test.go b/integration_test/ops_agent_test/main_test.go
index 4448427f6e..59b7f8c4da 100644
--- a/integration_test/ops_agent_test/main_test.go
+++ b/integration_test/ops_agent_test/main_test.go
@@ -5711,6 +5711,58 @@ func TestPartialSuccess(t *testing.T) {
 	})
 }
 
+// restartInstanceWithRetries stops vm and then starts it again, retrying the
+// start when it fails with what looks like a zone capacity (stockout) error.
+//
+// The stop is attempted once and any failure is returned immediately. The
+// start is attempted up to maxAttempts times with a one-minute wait between
+// attempts. A start error that does not match a known stockout message is
+// returned immediately, and cancellation of ctx during a wait returns
+// ctx.Err(). Progress is written to logger.
+func restartInstanceWithRetries(ctx context.Context, logger *log.Logger, vm *gce.VM) error {
+	const maxAttempts = 5
+
+	logger.Printf("Stopping instance %s...", vm.Name)
+	if err := gce.StopInstance(ctx, logger, vm); err != nil {
+		return fmt.Errorf("failed to stop instance: %w", err)
+	}
+
+	var lastErr error
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		logger.Printf("Starting instance %s (attempt %d/%d)...", vm.Name, attempt, maxAttempts)
+		err := gce.StartInstance(ctx, logger, vm)
+		if err == nil {
+			logger.Printf("Instance %s started successfully", vm.Name)
+			return nil
+		}
+
+		// Stockouts are detected by matching known substrings of the error
+		// message; anything else is treated as permanent and fails fast.
+		errStr := err.Error()
+		isStockout := strings.Contains(errStr, "ZONE_RESOURCE_POOL_EXHAUSTED") ||
+			strings.Contains(errStr, "currently unavailable") ||
+			strings.Contains(errStr, "resource pool exhausted")
+		if !isStockout {
+			return fmt.Errorf("failed to start instance (non-stockout error): %w", err)
+		}
+
+		lastErr = err
+		if attempt == maxAttempts {
+			// No attempts left: skip the wait instead of sleeping a minute
+			// only to report failure.
+			break
+		}
+
+		logger.Printf("Instance start failed due to stockout: %v. Retrying in 1 minute...", err)
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(1 * time.Minute):
+		}
+	}
+	return fmt.Errorf("failed to start instance after %d attempts. Last error: %w", maxAttempts, lastErr)
+}
+
 func TestRestartVM(t *testing.T) {
 	t.Parallel()
 	gce.RunForEachImage(t, func(t *testing.T, imageSpec string) {
@@ -5746,7 +5798,7 @@ func TestRestartVM(t *testing.T) {
 		}
 
 		logger.Printf(`Restarting instance. For details, see "VM_restart.txt".`)
-		if err := gce.RestartInstance(ctx, dirLog.ToFile("VM_restart.txt"), vm); err != nil {
+		if err := restartInstanceWithRetries(ctx, dirLog.ToFile("VM_restart.txt"), vm); err != nil {
 			t.Fatal(err)
 		}