From 842e5371d863f5d605bf50e488f9282549cfd55e Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Sun, 5 Apr 2026 09:05:51 +0530 Subject: [PATCH 1/8] Modified handleE2ECleanup + added cleanupOrphanedE2EInstances --- server/e2e_tests.go | 48 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 58eb08d..6355035 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -490,7 +490,7 @@ func (s *Server) triggerMobileE2EWorkflow(ctx context.Context, client *github.Cl return nil } -// handleE2ECleanup destroys all E2E instances for a PR +// handleE2ECleanup destroys tracked E2E instances, then queries the cloud API by DNS pattern to catch orphans. func (s *Server) handleE2ECleanup(pr *model.PullRequest) { logger := s.Logger.WithFields(logrus.Fields{ "repo": pr.RepoName, @@ -499,20 +499,56 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { }) logger.Info("Handling E2E cleanup request") - // Retrieve and remove instances from tracking + // Fast path: in-memory map key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) s.e2eInstancesLock.Lock() instances := s.e2eInstances[key] delete(s.e2eInstances, key) s.e2eInstancesLock.Unlock() - if len(instances) == 0 { - logger.Warn("No E2E instances found for cleanup") + if len(instances) > 0 { + logger.WithField("instances", len(instances)).Info("Destroying tracked E2E instances") + s.destroyE2EInstances(instances, logger) + } + + // Fallback: catch orphans from restarts, map overwrites, or failed goroutines + s.cleanupOrphanedE2EInstances(pr, logger) +} + +// cleanupOrphanedE2EInstances queries the cloud API by DNS LIKE pattern and destroys any matches. +func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logrus.FieldLogger) { + instanceType := "mobile" + if strings.Contains(pr.RepoName, "desktop") { + instanceType = "desktop" + } + + dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number) // e.g. 
"mobile-pr-9587-%" + + installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{ + DNS: dnsPattern, + Paging: cloudModel.Paging{ + Page: 0, + PerPage: 50, + }, + }) + if err != nil { + logger.WithError(err).Error("Failed to query cloud API for orphaned E2E instances") return } - logger.WithField("instances", len(instances)).Info("Destroying E2E instances") - s.destroyE2EInstances(instances, logger) + if len(installations) == 0 { + logger.Debug("No orphaned E2E instances found via cloud API") + return + } + + logger.WithField("orphans", len(installations)).Warn("Found orphaned E2E instances via cloud API") + for _, inst := range installations { + instLogger := logger.WithField("installation_id", inst.ID) + instLogger.Info("Destroying orphaned E2E instance") + if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { + instLogger.WithError(err).Error("Failed to destroy orphaned E2E instance") + } + } } // destroyE2EInstances destroys all given E2E instances From 4bf16d6cc9a9fb69a2853cf58997a156d9be8766 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Sun, 5 Apr 2026 09:15:47 +0530 Subject: [PATCH 2/8] fix review --- server/e2e_tests.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 6355035..2dcf7d4 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -525,11 +525,8 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number) // e.g. "mobile-pr-9587-%" installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{ - DNS: dnsPattern, - Paging: cloudModel.Paging{ - Page: 0, - PerPage: 50, - }, + DNS: dnsPattern, + Paging: cloudModel.AllPagesNotDeleted(), }) if err != nil { logger.WithError(err).Error("Failed to query cloud API for orphaned E2E instances") From 9491d74b32120060abc4461470457f2c53f89ef9 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Fri, 10 Apr 2026 10:03:54 +0530 Subject: [PATCH 3/8] allow single set of servers to be provisioned and used as long as PR is open. 
Destroy once the PR is merged --- server/e2e_tests.go | 171 +++++++++++++++++++++++++++++++++-------- server/pull_request.go | 4 +- 2 files changed, 140 insertions(+), 35 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 2dcf7d4..5945a1b 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -9,6 +9,7 @@ import ( "fmt" "net/url" "os" + "sort" "strings" "time" @@ -71,43 +72,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { }) logger.Info("Handling E2E test request") - // Check if there's already an E2E run in progress for this PR - // If yes, cancel it before starting a new one - key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) - s.e2eInstancesLock.Lock() - existingInstances, hasExisting := s.e2eInstances[key] - if hasExisting { - logger.WithField("existingInstances", len(existingInstances)).Info("Found existing E2E run, canceling it") - // Remove from tracking immediately to prevent race conditions - delete(s.e2eInstances, key) - s.e2eInstancesLock.Unlock() - - // Destroy old instances in background - go s.destroyE2EInstances(existingInstances, logger) - - // Also attempt to cancel the GitHub workflow run - go s.cancelPRWorkflowRuns(pr, logger) - } else { - s.e2eInstancesLock.Unlock() - } - - // Determine instance type based on repository + // Determine instance type and platforms first — needed for both reuse lookup and creation. var instanceType string var platforms []string var testPlatform string // For mobile: which OS to test (ios/android/both). For desktop: unused (tests all OS platforms) if strings.Contains(pr.RepoName, "desktop") { instanceType = "desktop" - // Desktop: platforms = OS runners (linux/macos/windows) - // Desktop tests run on all OS platforms automatically platforms = []string{"linux", "macos", "windows"} - testPlatform = "all" // Desktop always tests all OS platforms (linux/macos/windows) + testPlatform = "all" } else if strings.Contains(pr.RepoName, "mobile") { instanceType = "mobile" - // Mobile: platforms = server instances (site-1/site-2/site-3) // Always create all 3 mobile instances (workflow expects SITE_1/2/3_URL). platforms = []string{"site-1", "site-2", "site-3"} - // Mobile: testPlatform = which mobile OS to test (ios/android/both) testPlatform = s.extractPlatformFromLabel(label) logger.WithField("testPlatform", testPlatform).Info("Detected mobile test platform from label (ios/android/both)") } else { @@ -115,7 +92,40 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { return } - // Create multiple instances + key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + + // 1. Reuse existing in-memory instances (servers stay alive between label toggles). + s.e2eInstancesLock.Lock() + existingInstances := s.e2eInstances[key] + s.e2eInstancesLock.Unlock() + + if len(existingInstances) > 0 { + logger.WithField("instances", len(existingInstances)).Info("Reusing existing in-memory E2E instances") + go s.cancelPRWorkflowRuns(pr, logger) + s.wakeUpHibernatingInstances(existingInstances, logger) + if err := s.triggerE2EWorkflow(pr, existingInstances, instanceType, testPlatform); err != nil { + logger.WithError(err).Error("Failed to trigger E2E workflow with existing instances") + s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err)) + } + return + } + + // 2. Check cloud API for instances that survived a matterwick restart. 
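+	// The reuse is all-or-nothing: findExistingE2EInstancesInCloud errors on a partial
+	// set, so this falls through to fresh creation rather than mixing old and new servers.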
+ if cloudInstances, err := s.findExistingE2EInstancesInCloud(pr, instanceType, platforms); err == nil && len(cloudInstances) == len(platforms) { + logger.WithField("instances", len(cloudInstances)).Info("Reusing existing cloud E2E instances") + go s.cancelPRWorkflowRuns(pr, logger) + s.wakeUpHibernatingInstances(cloudInstances, logger) + s.e2eInstancesLock.Lock() + s.e2eInstances[key] = cloudInstances + s.e2eInstancesLock.Unlock() + if err := s.triggerE2EWorkflow(pr, cloudInstances, instanceType, testPlatform); err != nil { + logger.WithError(err).Error("Failed to trigger E2E workflow with cloud instances") + s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err)) + } + return + } + + // 3. No existing instances — create fresh ones. instances, err := s.createMultipleE2EInstances(pr, instanceType, platforms) if err != nil { logger.WithError(err).Error("Failed to create E2E instances") @@ -129,23 +139,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { return } - // Store instances for later cleanup (reuse key variable from above) s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() logger.WithField("instances", len(instances)).Info("Successfully created E2E instances") - // Trigger the appropriate workflow - err = s.triggerE2EWorkflow(pr, instances, instanceType, testPlatform) // Pass testPlatform: "all" for desktop, "ios"/"android"/"both" for mobile - if err != nil { + if err = s.triggerE2EWorkflow(pr, instances, instanceType, testPlatform); err != nil { logger.WithError(err).Error("Failed to trigger E2E workflow") s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err)) - // Remove instances from tracking map before cleanup to avoid double-destroy on later cleanup. + // Remove from tracking before cleanup to avoid double-destroy on later cleanup. s.e2eInstancesLock.Lock() delete(s.e2eInstances, key) s.e2eInstancesLock.Unlock() - // Attempt cleanup on workflow trigger failure s.destroyE2EInstances(instances, logger) return } @@ -540,6 +546,16 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru logger.WithField("orphans", len(installations)).Warn("Found orphaned E2E instances via cloud API") for _, inst := range installations { + // Skip instances already progressing through deletion to avoid redundant API calls. + if inst.State == cloudModel.InstallationStateDeletionPendingRequested || + inst.State == cloudModel.InstallationStateDeletionPendingInProgress || + inst.State == cloudModel.InstallationStateDeletionPending || + inst.State == cloudModel.InstallationStateDeletionRequested || + inst.State == cloudModel.InstallationStateDeletionFailed || + inst.State == cloudModel.InstallationStateDeleted { + logger.WithField("installation_id", inst.ID).Debug("Skipping E2E instance already in deletion state") + continue + } instLogger := logger.WithField("installation_id", inst.ID) instLogger.Info("Destroying orphaned E2E instance") if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { @@ -564,6 +580,95 @@ func (s *Server) destroyE2EInstances(instances []*E2EInstance, logger logrus.Fie } } +// findExistingE2EInstancesInCloud queries the cloud API for E2E instances that match a PR and +// reconstructs E2EInstance objects for reuse. Returns an error if the count doesn't match +// the expected number of platforms (indicating a partial or fully absent set). 
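+// Reusing a full set lets a re-applied label skip the lengthy (~30 min) provisioning path.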
+func (s *Server) findExistingE2EInstancesInCloud(pr *model.PullRequest, instanceType string, platforms []string) ([]*E2EInstance, error) { + dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number) + + installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{ + DNS: dnsPattern, + Paging: cloudModel.AllPagesNotDeleted(), + }) + if err != nil { + return nil, fmt.Errorf("failed to query cloud API for existing E2E instances: %w", err) + } + + // Only reuse instances that are in a stable, usable state. + var reusable []*cloudModel.InstallationDTO + for _, inst := range installations { + if inst.State == cloudModel.InstallationStateStable || + inst.State == cloudModel.InstallationStateHibernating { + reusable = append(reusable, inst) + } + } + + if len(reusable) != len(platforms) { + return nil, fmt.Errorf("found %d reusable instances, expected %d", len(reusable), len(platforms)) + } + + // Sort by OwnerID for consistent platform assignment across calls. + // desktop: linux < macos < windows; mobile: site-1 < site-2 < site-3 + sort.Slice(reusable, func(i, j int) bool { + return reusable[i].OwnerID < reusable[j].OwnerID + }) + + result := make([]*E2EInstance, len(reusable)) + for i, inst := range reusable { + platform := platforms[i] + // Reconstruct URL from OwnerID + DNSNameTestServer to avoid needing DNS records in list response. + e2eInst := &E2EInstance{ + Name: inst.OwnerID, + Platform: platform, + URL: fmt.Sprintf("https://%s.%s", inst.OwnerID, s.Config.DNSNameTestServer), + InstallationID: inst.ID, + ServerVersion: inst.Version, + } + if instanceType == "desktop" { + e2eInst.Runner = getRunnerForPlatform(platform) + } + result[i] = e2eInst + } + return result, nil +} + +// wakeUpHibernatingInstances checks each instance and wakes any that are hibernating, +// waiting up to 10 minutes for stable state. Logs warnings on failure and proceeds. 
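+// Failures here are deliberately non-fatal: the workflow is still dispatched, and an
+// instance left hibernating simply shows up as unreachable once the tests hit it.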
+func (s *Server) wakeUpHibernatingInstances(instances []*E2EInstance, logger logrus.FieldLogger) { + for _, inst := range instances { + installation, err := s.CloudClient.GetInstallation(inst.InstallationID, nil) + if err != nil { + logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Failed to check installation state before wake-up") + continue + } + if installation.State != cloudModel.InstallationStateHibernating { + continue + } + logger.WithField("installation_id", inst.InstallationID).Info("Waking up hibernating E2E instance") + if _, err := s.CloudClient.WakeupInstallation(inst.InstallationID, nil); err != nil { + logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Failed to wake up hibernating E2E instance") + continue + } + timeout := time.Now().Add(10 * time.Minute) + for { + updated, err := s.CloudClient.GetInstallation(inst.InstallationID, nil) + if err != nil { + logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Error polling installation state during wake-up") + break + } + if updated.State == cloudModel.InstallationStateStable { + logger.WithField("installation_id", inst.InstallationID).Info("Hibernating E2E instance is now stable") + break + } + if time.Now().After(timeout) { + logger.WithField("installation_id", inst.InstallationID).Warn("Timeout waiting for E2E instance to wake up") + break + } + time.Sleep(15 * time.Second) + } + } +} + // postE2EStartedComment posts a comment when E2E tests start func (s *Server) postE2EStartedComment(pr *model.PullRequest, instances []*E2EInstance) { ctx := context.Background() diff --git a/server/pull_request.go b/server/pull_request.go index c3255b3..68d5bfb 100644 --- a/server/pull_request.go +++ b/server/pull_request.go @@ -65,8 +65,8 @@ func (s *Server) handlePullRequestEvent(event *github.PullRequestEvent) { return } if s.isE2ELabel(label) { - logger.WithField("label", label).Info("PR E2E test label was removed") - go s.handleE2ECleanup(pr) + logger.WithField("label", label).Info("PR E2E test label was removed, canceling workflow runs but keeping servers alive") + go s.cancelPRWorkflowRuns(pr, logger) return } if s.isSpinWickLabel(label) { From 0670863d07e8c65b9741fcec9237bbf728b7d1cf Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Fri, 10 Apr 2026 10:23:28 +0530 Subject: [PATCH 4/8] add server settings --- server/e2e_tests.go | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 5945a1b..26c48e8 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -101,7 +101,7 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { if len(existingInstances) > 0 { logger.WithField("instances", len(existingInstances)).Info("Reusing existing in-memory E2E instances") - go s.cancelPRWorkflowRuns(pr, logger) + s.cancelPRWorkflowRuns(pr, logger) s.wakeUpHibernatingInstances(existingInstances, logger) if err := s.triggerE2EWorkflow(pr, existingInstances, instanceType, testPlatform); err != nil { logger.WithError(err).Error("Failed to trigger E2E workflow with existing instances") @@ -113,7 +113,7 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { // 2. Check cloud API for instances that survived a matterwick restart. 
if cloudInstances, err := s.findExistingE2EInstancesInCloud(pr, instanceType, platforms); err == nil && len(cloudInstances) == len(platforms) { logger.WithField("instances", len(cloudInstances)).Info("Reusing existing cloud E2E instances") - go s.cancelPRWorkflowRuns(pr, logger) + s.cancelPRWorkflowRuns(pr, logger) s.wakeUpHibernatingInstances(cloudInstances, logger) s.e2eInstancesLock.Lock() s.e2eInstances[key] = cloudInstances @@ -218,14 +218,19 @@ func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType func (s *Server) createCloudInstallation(name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) { // Create installation request envVars := cloudModel.EnvVarMap{ - "MM_SERVICESETTINGS_ENABLETUTORIAL": cloudModel.EnvVar{Value: "false"}, - "MM_SERVICESETTINGS_ENABLEONBOARDINGFLOW": cloudModel.EnvVar{Value: "false"}, - "MM_SERVICEENVIRONMENT": cloudModel.EnvVar{Value: "test"}, - } - - // Enable automatic replies for mobile E2E tests - if instanceType == "mobile" { - envVars["MM_TEAMSETTINGS_EXPERIMENTALENABLEAUTOMATICREPLIES"] = cloudModel.EnvVar{Value: "true"} + "MM_SERVICESETTINGS_ENABLETUTORIAL": cloudModel.EnvVar{Value: "false"}, + "MM_SERVICESETTINGS_ENABLEONBOARDINGFLOW": cloudModel.EnvVar{Value: "false"}, + "MM_SERVICESETTINGS_ENABLEUSERTYPINGMESSAGES": cloudModel.EnvVar{Value: "false"}, + "MM_SERVICESETTINGS_SESSIONLENGTHMOBILEINHOURS": cloudModel.EnvVar{Value: "5000"}, + "MM_SERVICESETTINGS_SESSIONCACHEINMINUTES": cloudModel.EnvVar{Value: "180"}, + "MM_SERVICEENVIRONMENT": cloudModel.EnvVar{Value: "test"}, + "MM_RATELIMITSETTINGS_ENABLE": cloudModel.EnvVar{Value: "true"}, + "MM_RATELIMITSETTINGS_PERSEC": cloudModel.EnvVar{Value: "3000"}, + "MM_RATELIMITSETTINGS_MAXBURST": cloudModel.EnvVar{Value: "5000"}, + "MM_RATELIMITSETTINGS_MEMORYSTORESIZE": cloudModel.EnvVar{Value: "10000"}, + "MM_RATELIMITSETTINGS_VARYBYREMOTEADDR": cloudModel.EnvVar{Value: "false"}, + "MM_RATELIMITSETTINGS_VARYBYUSER": cloudModel.EnvVar{Value: "false"}, + "MM_TEAMSETTINGS_EXPERIMENTALENABLEAUTOMATICREPLIES": cloudModel.EnvVar{Value: "true"}, } installationRequest := &cloudModel.CreateInstallationRequest{ @@ -523,9 +528,14 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { // cleanupOrphanedE2EInstances queries the cloud API by DNS LIKE pattern and destroys any matches. func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logrus.FieldLogger) { - instanceType := "mobile" + var instanceType string if strings.Contains(pr.RepoName, "desktop") { instanceType = "desktop" + } else if strings.Contains(pr.RepoName, "mobile") { + instanceType = "mobile" + } else { + logger.Debug("Skipping orphan E2E cleanup for non-E2E repo") + return } dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number) // e.g. "mobile-pr-9587-%" From ca32cc91f5e5ddb20392fe5bf82d55ba00aaab5e Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Fri, 10 Apr 2026 19:04:27 +0530 Subject: [PATCH 5/8] feat(e2e): parallel instance creation, push trigger runType, and startup orphan cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Parallelize createMultipleE2EInstances, createMultipleE2EInstancesForPushEvent, and createCMTInstancesForVersion: all 3 platform instances are now created concurrently (desktop: ~30 min → ~30 min total instead of ~90 min; mobile: same improvement). 
Uses pre-allocated results slice for stable ordering and shared context cancellation so a failing goroutine immediately unblocks siblings instead of letting them sleep. - Add ctx parameter to createCloudInstallation; replace time.Sleep with context-aware select so cancellation propagates within one 30s interval. - Fix PR-closed-during-creation race: after createMultipleE2EInstances returns, check PR state via GitHub API. If the PR was closed during the ~30 min creation window, destroy the instances without tracking them (no further cleanup event will fire). - Add cleanupNonPRE2EInstancesOnStartup: on server start, destroy non-PR E2E instances older than 8 hours that were orphaned by a previous matterwick restart. PR instances are skipped (managed by handleE2ECleanup on PR close). Called before the HTTP listener starts to avoid races with incoming webhook events. - Extend WorkflowRunWithInputs with Event field (GitHub workflow_run.event: "push", "schedule", etc.) and thread it into handleNightlyE2ETrigger. Scheduled runs always get runType="NIGHTLY"; push-triggered runs derive MASTER/RELEASE from branch name. This allows the E2E Nightly Trigger workflow in desktop/mobile repos to serve all three run types from a single workflow file with push + schedule triggers. Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 194 ++++++++++++++++++++++++++++++++++------- server/push_events.go | 72 ++++++++++----- server/server.go | 5 ++ server/workflow_run.go | 107 ++++++++++++++++------- 4 files changed, 294 insertions(+), 84 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 26c48e8..637fe41 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -11,6 +11,7 @@ import ( "os" "sort" "strings" + "sync" "time" "github.com/google/go-github/v32/github" @@ -139,6 +140,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { return } + // Instance creation takes ~30 min. Check if the PR was closed during that window. + // If so, destroy the freshly created instances — no further cleanup events will fire + // for a closed PR, so storing them would leak them permanently. + prInfo, _, prErr := newGithubClient(s.Config.GithubAccessToken).PullRequests.Get( + context.Background(), pr.RepoOwner, pr.RepoName, pr.Number) + if prErr != nil { + logger.WithError(prErr).Warn("Failed to check PR state after instance creation; proceeding") + } else if prInfo.GetState() == "closed" { + logger.Warn("PR was closed during E2E instance creation; destroying instances without tracking") + s.destroyE2EInstances(instances, logger) + return + } + s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() @@ -159,14 +173,14 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { logger.Info("Successfully triggered E2E workflow") } -// createMultipleE2EInstances creates multiple instances for E2E testing +// createMultipleE2EInstances creates all platform instances in parallel. +// Results are returned in the same order as platforms[] so that callers can rely on +// index-based platform assignment (e.g. instances[0] = site-1 for mobile). 
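+// (mobile: instances[0..2] map to site-1..site-3; desktop: linux, macos, windows.)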
func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType string, platforms []string) ([]*E2EInstance, error) { if len(platforms) == 0 { return nil, fmt.Errorf("no platforms specified") } - var instances []*E2EInstance - logger := s.Logger.WithFields(logrus.Fields{ "repo": pr.RepoName, "pr": pr.Number, @@ -174,48 +188,74 @@ func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType "platforms": len(platforms), }) - // Create username and password for this E2E test set - var username, password string - username = s.Config.E2EUsername - - // Get password from environment or generate one - password = s.getE2EPassword(instanceType) - + username := s.Config.E2EUsername + password := s.getE2EPassword(instanceType) // Name format: {type}-pr-{pr}-{platform}-{hex6} uid := e2eUniqueSuffix() - for _, platform := range platforms { - instanceName := e2eInstanceName( - s.Config.DNSNameTestServer, - instanceType, fmt.Sprintf("pr-%d", pr.Number), platform, uid, - ) + // Shared cancellable context: the first goroutine to fail cancels the rest so they + // exit their polling loop within one sleep interval (30s) instead of waiting up to 30min. + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() - logger.WithField("instance", instanceName).Info("Creating E2E instance") + type result struct { + instance *E2EInstance + err error + } + // Pre-allocate by index so each goroutine writes to its own slot — no mutex needed. + results := make([]result, len(platforms)) + var wg sync.WaitGroup + + for i, platform := range platforms { + wg.Add(1) + go func(idx int, platform string) { + defer wg.Done() + instanceName := e2eInstanceName( + s.Config.DNSNameTestServer, + instanceType, fmt.Sprintf("pr-%d", pr.Number), platform, uid, + ) + logger.WithField("instance", instanceName).Info("Creating E2E instance") + inst, err := s.createCloudInstallation(ctx, instanceName, s.Config.E2EServerVersion, username, password, instanceType, logger) + if err != nil { + cancel() // signal sibling goroutines to stop waiting + results[idx] = result{err: err} + return + } + inst.Platform = platform + if instanceType == "desktop" { + inst.Runner = getRunnerForPlatform(platform) + } + results[idx] = result{instance: inst} + }(i, platform) + } - // Create the installation - instance, err := s.createCloudInstallation(instanceName, s.Config.E2EServerVersion, username, password, instanceType, logger) - if err != nil { - logger.WithError(err).Error("Failed to create cloud installation") - // Cleanup already created instances on failure - s.destroyE2EInstances(instances, logger) - return nil, err - } + wg.Wait() - instance.Platform = platform - if instanceType == "desktop" { - // Assign appropriate runner for each platform - instance.Runner = getRunnerForPlatform(platform) + // Collect results in platforms[] order. On any error, destroy all that succeeded. 
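+	// (Failed slots hold a nil instance; only non-nil results are appended below, so
+	// destroyE2EInstances never sees a nil pointer on the error path.)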
+ var instances []*E2EInstance + var firstErr error + for _, r := range results { + if r.err != nil { + if firstErr == nil { + firstErr = r.err + } + } else { + instances = append(instances, r.instance) } + } - instances = append(instances, instance) - logger.WithField("instance", instanceName).Info("Successfully created E2E instance") + if firstErr != nil { + s.destroyE2EInstances(instances, logger) + return nil, firstErr } return instances, nil } -// createCloudInstallation creates a single installation via provisioner API -func (s *Server) createCloudInstallation(name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) { +// createCloudInstallation creates a single installation via provisioner API. +// ctx is used to cancel the polling wait so that parallel callers can abort early when a +// sibling goroutine fails, instead of waiting up to 30 minutes per polling interval. +func (s *Server) createCloudInstallation(ctx context.Context, name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) { // Create installation request envVars := cloudModel.EnvVarMap{ "MM_SERVICESETTINGS_ENABLETUTORIAL": cloudModel.EnvVar{Value: "false"}, @@ -274,7 +314,17 @@ func (s *Server) createCloudInstallation(name, version, username, password, inst return nil, fmt.Errorf("timeout waiting for installation to become stable") } - time.Sleep(30 * time.Second) + // Context-aware sleep: wake immediately if a sibling goroutine failed and cancelled ctx. + select { + case <-ctx.Done(): + return nil, fmt.Errorf("installation wait cancelled: %w", ctx.Err()) + case <-time.After(30 * time.Second): + } + } + + // Check cancellation before the 10-minute initialization phase (DNS + ping + user setup). + if err := ctx.Err(); err != nil { + return nil, fmt.Errorf("installation creation cancelled before initialization: %w", err) } // Initialize Mattermost server with provided credentials @@ -574,6 +624,84 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru } } +// cleanupNonPRE2EInstancesOnStartup destroys stale non-PR E2E instances left over from a +// previous matterwick run. When matterwick restarts, the in-memory tracking map is wiped, +// making push/nightly/CMT instances untrackable — the workflow_run(completed) cleanup path +// will find nothing. A 8-hour age threshold is used so that instances still being actively +// used by tests that started before the restart are NOT destroyed mid-run. +// +// Reasoning: E2E tests (nightly, push, CMT) take at most a few hours. Any non-PR instance +// older than 8 hours whose tracking was lost must be orphaned. +// +// PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup +// on PR close, which includes DNS orphan cleanup, manages their lifecycle. 
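+//
+// A hypothetical name this catches: "desktop-<version>-linux-<uid>" — no "-pr-" token,
+// so its tracking entry died with the previous process and nothing else will reap it.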
+func (s *Server) cleanupNonPRE2EInstancesOnStartup() { + const maxAge = 8 * time.Hour + logger := s.Logger.WithField("type", "startup_e2e_cleanup") + logger.WithField("max_age_hours", 8).Info("Scanning for stale non-PR E2E instances from previous matterwick run") + + cutoffMs := time.Now().Add(-maxAge).UnixMilli() + + for _, instanceType := range []string{"desktop", "mobile"} { + pattern := instanceType + "-%" + installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{ + DNS: pattern, + Paging: cloudModel.AllPagesNotDeleted(), + }) + if err != nil { + logger.WithError(err).Errorf("Failed to query %s instances on startup", instanceType) + continue + } + + for _, inst := range installations { + // Guard against a nil embedded Installation — must come first, all other + // field accesses (OwnerID, CreateAt, State, ID) are on the embedded struct. + if inst.Installation == nil { + logger.Warn("Skipping instance with nil Installation pointer in startup cleanup") + continue + } + + // PR instances have "-pr-" in their OwnerID (e.g. "mobile-pr-123-site-1-..."). + // Skip them — handleE2ECleanup on PR close manages their lifecycle. + if strings.Contains(inst.OwnerID, "-pr-") { + continue + } + + // Skip instances created within the last 8 hours — tests may still be running. + // If matterwick restarted mid-test, destroying active test instances would break them. + if inst.CreateAt > cutoffMs { + logger.WithFields(logrus.Fields{ + "installation_id": inst.ID, + "owner_id": inst.OwnerID, + }).Debug("Skipping non-PR instance younger than 8h (may still be in use)") + continue + } + + // Skip instances already progressing through deletion. + if inst.State == cloudModel.InstallationStateDeletionPendingRequested || + inst.State == cloudModel.InstallationStateDeletionPendingInProgress || + inst.State == cloudModel.InstallationStateDeletionPending || + inst.State == cloudModel.InstallationStateDeletionRequested || + inst.State == cloudModel.InstallationStateDeletionFailed || + inst.State == cloudModel.InstallationStateDeleted { + continue + } + + instLogger := logger.WithFields(logrus.Fields{ + "installation_id": inst.ID, + "owner_id": inst.OwnerID, + "state": inst.State, + }) + instLogger.Warn("Destroying stale non-PR E2E instance (older than 8h) left from previous matterwick run") + if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { + instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance") + } + } + } + + logger.Info("Startup non-PR E2E instance cleanup complete") +} + // destroyE2EInstances destroys all given E2E instances func (s *Server) destroyE2EInstances(instances []*E2EInstance, logger logrus.FieldLogger) { for _, instance := range instances { diff --git a/server/push_events.go b/server/push_events.go index 1c58484..a12f45c 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -4,8 +4,10 @@ package server import ( + "context" "fmt" "strings" + "sync" "github.com/google/go-github/v32/github" "github.com/sirupsen/logrus" @@ -135,12 +137,10 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers logger.Info("E2E workflow triggered successfully and instances tracked for cleanup") } -// createMultipleE2EInstancesForPushEvent creates instances for push event E2E testing +// createMultipleE2EInstancesForPushEvent creates all platform instances in parallel. +// Results are returned in platforms[] order so index-based assignment is stable. 
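+// Unlike the PR path, names here embed the sanitized server version rather than a PR
+// number; the parallel fan-out/collect shape is otherwise identical.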
func (s *Server) createMultipleE2EInstancesForPushEvent(repoName, instanceType, branch, version, _ string) ([]*E2EInstance, error) { - var instances []*E2EInstance var platforms []string - - // For push events, always use all platforms if instanceType == "desktop" { platforms = []string{"linux", "macos", "windows"} } else { @@ -164,25 +164,55 @@ func (s *Server) createMultipleE2EInstancesForPushEvent(repoName, instanceType, username := s.Config.E2EUsername password := s.getE2EPassword(instanceType) - for _, platform := range platforms { - name := e2eInstanceName( - s.Config.DNSNameTestServer, - instanceType, sanitizedVersion, platform, uid, - ) - - instance, err := s.createCloudInstallation(name, serverVersion, username, password, instanceType, logger) - if err != nil { - logger.WithError(err).Errorf("Failed to create instance for platform %s", platform) - // Cleanup already created instances on failure - s.destroyE2EInstances(instances, logger) - return nil, err - } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + type result struct { + instance *E2EInstance + err error + } + results := make([]result, len(platforms)) + var wg sync.WaitGroup + + for i, platform := range platforms { + wg.Add(1) + go func(idx int, platform string) { + defer wg.Done() + name := e2eInstanceName( + s.Config.DNSNameTestServer, + instanceType, sanitizedVersion, platform, uid, + ) + inst, err := s.createCloudInstallation(ctx, name, serverVersion, username, password, instanceType, logger) + if err != nil { + cancel() + results[idx] = result{err: err} + return + } + inst.Platform = platform + if instanceType == "desktop" { + inst.Runner = getRunnerForPlatform(platform) + } + results[idx] = result{instance: inst} + }(i, platform) + } - instance.Platform = platform - if instanceType == "desktop" { - instance.Runner = getRunnerForPlatform(platform) + wg.Wait() + + var instances []*E2EInstance + var firstErr error + for _, r := range results { + if r.err != nil { + if firstErr == nil { + firstErr = r.err + } + } else { + instances = append(instances, r.instance) } - instances = append(instances, instance) + } + + if firstErr != nil { + s.destroyE2EInstances(instances, logger) + return nil, firstErr } logger.WithField("instanceCount", len(instances)).Info("All E2E instances created successfully") diff --git a/server/server.go b/server/server.go index 5b0fe78..1322c35 100644 --- a/server/server.go +++ b/server/server.go @@ -98,6 +98,11 @@ func New(config *MatterwickConfig) *Server { func (s *Server) Start() { s.Logger.Info("Starting MatterWick Server") + // Destroy any non-PR E2E instances left over from a previous run. + // Must run before the HTTP listener starts so that cleanup completes before + // new webhook events can arrive and re-create conflicting instances. + s.cleanupNonPRE2EInstancesOnStartup() + s.initializeRouter() var handler http.Handler = s.Router diff --git a/server/workflow_run.go b/server/workflow_run.go index abc626a..5fe1200 100644 --- a/server/workflow_run.go +++ b/server/workflow_run.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "strings" + "sync" "github.com/sirupsen/logrus" ) @@ -27,6 +28,7 @@ type WorkflowRunWithInputs struct { Name string `json:"name"` HeadBranch string `json:"head_branch"` HeadSHA string `json:"head_sha"` + Event string `json:"event"` // triggering event: "push", "schedule", "workflow_dispatch", etc. 
Inputs map[string]string `json:"inputs"` } @@ -122,7 +124,7 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay if s.Config.E2ENightlyTriggerWorkflowName != "" && workflowName == s.Config.E2ENightlyTriggerWorkflowName { if payload.Action == "requested" { logger.Info("Nightly trigger workflow started, provisioning E2E servers") - go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, logger) + go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, payload.WorkflowRun.Event, logger) } return } @@ -137,9 +139,11 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay logger.Debug("Ignoring workflow_run event (not relevant to E2E lifecycle)") } -// handleNightlyE2ETrigger provisions instances and dispatches the test workflow -// for scheduled/nightly E2E runs. Called when the nightly trigger workflow starts. -func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, logger logrus.FieldLogger) { +// handleNightlyE2ETrigger provisions instances and dispatches the test workflow. +// Called when the E2E trigger workflow starts, whether from schedule, push to master/main, +// or push to a release branch. The triggerEvent parameter ("schedule", "push", etc.) is +// used to set runType correctly — scheduled runs always get "NIGHTLY" regardless of branch. +func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEvent string, logger logrus.FieldLogger) { logger = logger.WithFields(logrus.Fields{ "branch": branch, "sha": sha, @@ -168,6 +172,21 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo logger.WithField("tracking_key", key).Info("Nightly instances tracked, dispatching test workflow") + // Determine run classification. Scheduled runs are always NIGHTLY regardless of branch + // (a scheduled run on master must not be classified as MASTER). Push-triggered runs + // derive their type from the branch name. + runType := "NIGHTLY" + nightly := true + if triggerEvent != "schedule" { + if branch == "master" || branch == "main" { + runType = "MASTER" + nightly = false + } else if s.isReleaseBranch(branch) { + runType = "RELEASE" + nightly = false + } + } + var dispatchErr error if instanceType == "desktop" { instanceDetailsJSON, err := s.buildInstanceDetailsJSON(instances) @@ -180,8 +199,7 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo return } // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - // Pass runType="NIGHTLY" and nightly=true so the workflow correctly classifies this run. - dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, "NIGHTLY", true) + dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, runType, nightly) } else { if len(instances) < 3 { logger.Errorf("Expected 3 mobile instances, got %d", len(instances)) @@ -192,9 +210,8 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo return } // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - // Pass runType="NIGHTLY" so the workflow correctly classifies this as a nightly run. 
dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, sha, sha, - instances[0].URL, instances[1].URL, instances[2].URL, "both", "NIGHTLY") + instances[0].URL, instances[1].URL, instances[2].URL, "both", runType) } if dispatchErr != nil { @@ -361,7 +378,7 @@ func (s *Server) createSingleCMTInstance(repoName, instanceType, version string, username := s.Config.E2EUsername password := s.getE2EPassword(instanceType) - return s.createCloudInstallation(name, version, username, password, instanceType, logger) + return s.createCloudInstallation(context.Background(), name, version, username, password, instanceType, logger) } // cmtServer is the server entry in CMT_MATRIX JSON. @@ -495,13 +512,12 @@ func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrix return nil } -// createCMTInstancesForVersion creates 3 instances (one per platform) for a given server -// version. Used by nightly runs which dispatch the platform-aware e2e-functional.yml / -// e2e-detox-pr.yml workflows (not the CMT matrix workflow). +// createCMTInstancesForVersion creates 3 instances (one per platform) in parallel for a +// given server version. Used by nightly runs which dispatch the platform-aware +// e2e-functional.yml / e2e-detox-pr.yml workflows (not the CMT matrix workflow). +// Results are returned in platforms[] order so index-based assignment is stable. func (s *Server) createCMTInstancesForVersion(repoName, instanceType, version, purpose string) ([]*E2EInstance, error) { - var instances []*E2EInstance var platforms []string - if instanceType == "desktop" { platforms = []string{"linux", "macos", "windows"} } else { @@ -521,25 +537,56 @@ func (s *Server) createCMTInstancesForVersion(repoName, instanceType, version, p username := s.Config.E2EUsername password := s.getE2EPassword(instanceType) - for _, platform := range platforms { - name := e2eInstanceName( - s.Config.DNSNameTestServer, - instanceType, sanitizedVersion, platform, uid, - ) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + type result struct { + instance *E2EInstance + err error + } + results := make([]result, len(platforms)) + var wg sync.WaitGroup + + for i, platform := range platforms { + wg.Add(1) + go func(idx int, platform string) { + defer wg.Done() + name := e2eInstanceName( + s.Config.DNSNameTestServer, + instanceType, sanitizedVersion, platform, uid, + ) + inst, err := s.createCloudInstallation(ctx, name, version, username, password, instanceType, logger) + if err != nil { + cancel() + results[idx] = result{err: err} + return + } + inst.Platform = platform + if instanceType == "desktop" { + inst.Runner = getRunnerForPlatform(platform) + } + results[idx] = result{instance: inst} + }(i, platform) + } + + wg.Wait() - instance, err := s.createCloudInstallation(name, version, username, password, instanceType, logger) - if err != nil { - logger.WithError(err).Errorf("Failed to create instance for platform %s", platform) - // Cleanup already created instances on failure - s.destroyE2EInstances(instances, logger) - return nil, err + var instances []*E2EInstance + var firstErr error + for _, r := range results { + if r.err != nil { + if firstErr == nil { + firstErr = r.err + } + } else { + instances = append(instances, r.instance) } + } - instance.Platform = platform - if instanceType == "desktop" { - instance.Runner = getRunnerForPlatform(platform) - } - instances = append(instances, instance) + if firstErr != nil { + logger.WithError(firstErr).Error("Failed to create one or more instances; 
destroying all") + s.destroyE2EInstances(instances, logger) + return nil, firstErr } logger.WithField("instanceCount", len(instances)).Info("Instances created for version") From 1b66b023cc8ba4422347fa313028f9c1d02e6281 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Sat, 11 Apr 2026 09:36:40 +0530 Subject: [PATCH 6/8] fix(e2e): orphan cleanup, platform validation, and runID in nightly key - createCloudInstallation: delete the cloud installation on all failure paths that occur after CreateInstallation succeeds (GetInstallation error, timeout, and both ctx cancellation points). Previously those paths returned without cleaning up, permanently orphaning the installation since it is never added to the in-memory tracking map. - findExistingE2EInstancesInCloud: replace sort-by-OwnerID index assignment with explicit platform parsing. Strip the 8-char hex uid suffix from each OwnerID, match against the known platforms slice, reject duplicates and missing platforms with a clear error, then build the result by iterating platforms[] in order via a map. The old approach assigned platforms by coincidental alphabetical sort order and silently misassigned them when duplicates were present. - handleNightlyE2ETrigger: include runID in the tracking key ("{repo}-scheduled-{runID}-{sha}") so that two trigger runs against the same SHA (e.g. a manual re-trigger) get separate map entries instead of overwriting each other. The key still ends with "-{sha}" so findAndDestroyInstancesBySHA continues to match by suffix. Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 64 ++++++++++++++++++++++++++++++++++-------- server/workflow_run.go | 11 +++++--- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 637fe41..214b8c8 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -9,7 +9,6 @@ import ( "fmt" "net/url" "os" - "sort" "strings" "sync" "time" @@ -295,6 +294,19 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use return nil, fmt.Errorf("failed to create installation: %w", err) } + // deleteInstallation is a best-effort cleanup helper used on all failure paths after + // CreateInstallation succeeds. Without it, the cloud installation would be permanently + // orphaned because it has not yet been added to the in-memory tracking map. 
+ deleteInstallation := func(reason string) { + logger.WithFields(logrus.Fields{ + "installation_id": installation.ID, + "reason": reason, + }).Warn("Deleting partially created installation to avoid orphan") + if delErr := s.CloudClient.DeleteInstallation(installation.ID); delErr != nil { + logger.WithError(delErr).WithField("installation_id", installation.ID).Error("Failed to delete orphaned installation") + } + } + logger.WithField("installation_id", installation.ID).Info("Installation created, waiting for stable state") // Wait for installation to be stable using polling with timeout @@ -302,6 +314,7 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use for { inst, err := s.CloudClient.GetInstallation(installation.ID, nil) if err != nil { + deleteInstallation("GetInstallation error during polling") return nil, fmt.Errorf("failed to get installation status: %w", err) } @@ -311,12 +324,14 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use } if time.Now().After(timeout) { + deleteInstallation("timeout waiting for stable state") return nil, fmt.Errorf("timeout waiting for installation to become stable") } // Context-aware sleep: wake immediately if a sibling goroutine failed and cancelled ctx. select { case <-ctx.Done(): + deleteInstallation("context cancelled during polling") return nil, fmt.Errorf("installation wait cancelled: %w", ctx.Err()) case <-time.After(30 * time.Second): } @@ -324,6 +339,7 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use // Check cancellation before the 10-minute initialization phase (DNS + ping + user setup). if err := ctx.Err(); err != nil { + deleteInstallation("context cancelled before initialization") return nil, fmt.Errorf("installation creation cancelled before initialization: %w", err) } @@ -741,20 +757,44 @@ func (s *Server) findExistingE2EInstancesInCloud(pr *model.PullRequest, instance } } - if len(reusable) != len(platforms) { - return nil, fmt.Errorf("found %d reusable instances, expected %d", len(reusable), len(platforms)) + // Parse the platform token from each OwnerID. + // OwnerID format: {type}-{version}-{platform}-{uid} where uid is 8 hex chars. + // Strip the trailing "-{uid}" (9 chars) then check which expected platform is a suffix. + platformByInst := make(map[string]*cloudModel.InstallationDTO, len(reusable)) // platform → inst + for _, inst := range reusable { + ownerID := inst.OwnerID + var matched string + for _, p := range platforms { + // OwnerID without uid ends with "-{platform}"; uid is 9 chars ("-" + 8 hex). + withoutUID := ownerID + if len(ownerID) > 9 { + withoutUID = ownerID[:len(ownerID)-9] + } + if strings.HasSuffix(withoutUID, "-"+p) { + matched = p + break + } + } + if matched == "" { + return nil, fmt.Errorf("could not determine platform for instance %q", ownerID) + } + if _, dup := platformByInst[matched]; dup { + return nil, fmt.Errorf("duplicate platform %q found among cloud instances", matched) + } + platformByInst[matched] = inst } - // Sort by OwnerID for consistent platform assignment across calls. - // desktop: linux < macos < windows; mobile: site-1 < site-2 < site-3 - sort.Slice(reusable, func(i, j int) bool { - return reusable[i].OwnerID < reusable[j].OwnerID - }) + // Validate that every expected platform is present. 
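+	// (Duplicates were already rejected above, so a miss here means the cloud set is
+	// partial and the caller falls back to creating a fresh set.)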
+ for _, p := range platforms { + if _, ok := platformByInst[p]; !ok { + return nil, fmt.Errorf("expected platform %q not found among cloud instances (found %d, want %d)", p, len(reusable), len(platforms)) + } + } - result := make([]*E2EInstance, len(reusable)) - for i, inst := range reusable { - platform := platforms[i] - // Reconstruct URL from OwnerID + DNSNameTestServer to avoid needing DNS records in list response. + // Build result in platforms[] order so index-based assignment is stable for callers. + result := make([]*E2EInstance, len(platforms)) + for i, platform := range platforms { + inst := platformByInst[platform] e2eInst := &E2EInstance{ Name: inst.OwnerID, Platform: platform, diff --git a/server/workflow_run.go b/server/workflow_run.go index 5fe1200..db3e5c8 100644 --- a/server/workflow_run.go +++ b/server/workflow_run.go @@ -124,7 +124,7 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay if s.Config.E2ENightlyTriggerWorkflowName != "" && workflowName == s.Config.E2ENightlyTriggerWorkflowName { if payload.Action == "requested" { logger.Info("Nightly trigger workflow started, provisioning E2E servers") - go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, payload.WorkflowRun.Event, logger) + go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, payload.WorkflowRun.Event, runID, logger) } return } @@ -143,10 +143,11 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay // Called when the E2E trigger workflow starts, whether from schedule, push to master/main, // or push to a release branch. The triggerEvent parameter ("schedule", "push", etc.) is // used to set runType correctly — scheduled runs always get "NIGHTLY" regardless of branch. -func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEvent string, logger logrus.FieldLogger) { +func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEvent string, runID int64, logger logrus.FieldLogger) { logger = logger.WithFields(logrus.Fields{ "branch": branch, "sha": sha, + "run_id": runID, }) logger.Info("Provisioning nightly E2E instances") @@ -164,8 +165,10 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEv return } - // Track by sha so the test workflow completion can clean up - key := fmt.Sprintf("%s-scheduled-%s", repoName, sha) + // Include runID so two trigger runs against the same SHA (e.g. manual re-trigger) + // get separate tracking keys. The key still ends with "-{sha}" so + // findAndDestroyInstancesBySHA continues to match it by suffix. 
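+	// e.g. "<repo>-scheduled-123456789-<sha>" — the runID shown is illustrative.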
+ key := fmt.Sprintf("%s-scheduled-%d-%s", repoName, runID, sha) s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() From 6298fb7264068ac5f8d5c70f27e0b747889de70e Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Sat, 11 Apr 2026 09:40:19 +0530 Subject: [PATCH 7/8] fix(e2e): cleanup installation on initializeMattermostE2EServer failure Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 1 + 1 file changed, 1 insertion(+) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 214b8c8..c95e4a9 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -347,6 +347,7 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use spinwickURL := fmt.Sprintf("https://%s", cloudtools.GetInstallationDNSFromDNSRecords(installation)) err = s.initializeMattermostE2EServer(spinwickURL, username, password, logger) if err != nil { + deleteInstallation("initializeMattermostE2EServer failed") return nil, fmt.Errorf("failed to initialize Mattermost server: %w", err) } From eb140b3df83e193893750afdf333b1100323f030 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Sat, 11 Apr 2026 09:46:26 +0530 Subject: [PATCH 8/8] fix(e2e): pre-flight ctx check and cleanupCreatedInstallation helper Add early ctx.Err() check before CreateInstallation to avoid starting a cloud installation when context is already cancelled. Replace the void deleteInstallation helper with cleanupCreatedInstallation(cause error) error so each failure path is a single return statement and the cause is returned unchanged after best-effort cleanup. Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index c95e4a9..6db2d0f 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -255,6 +255,10 @@ func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType // ctx is used to cancel the polling wait so that parallel callers can abort early when a // sibling goroutine fails, instead of waiting up to 30 minutes per polling interval. func (s *Server) createCloudInstallation(ctx context.Context, name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) { + if err := ctx.Err(); err != nil { + return nil, fmt.Errorf("installation creation cancelled before request: %w", err) + } + // Create installation request envVars := cloudModel.EnvVarMap{ "MM_SERVICESETTINGS_ENABLETUTORIAL": cloudModel.EnvVar{Value: "false"}, @@ -294,17 +298,15 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use return nil, fmt.Errorf("failed to create installation: %w", err) } - // deleteInstallation is a best-effort cleanup helper used on all failure paths after + // cleanupCreatedInstallation is a best-effort cleanup helper used on all failure paths after // CreateInstallation succeeds. Without it, the cloud installation would be permanently // orphaned because it has not yet been added to the in-memory tracking map. - deleteInstallation := func(reason string) { - logger.WithFields(logrus.Fields{ - "installation_id": installation.ID, - "reason": reason, - }).Warn("Deleting partially created installation to avoid orphan") + // It deletes the installation, logs any deletion error, and returns cause unchanged. 
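+	// (Best-effort only: if DeleteInstallation also fails here, the instance is left
+	// for the orphan-cleanup paths — the PR-close DNS sweep or the startup sweep — to reap.)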
+ cleanupCreatedInstallation := func(cause error) error { if delErr := s.CloudClient.DeleteInstallation(installation.ID); delErr != nil { - logger.WithError(delErr).WithField("installation_id", installation.ID).Error("Failed to delete orphaned installation") + logger.WithError(delErr).WithField("installation_id", installation.ID).Error("Failed to clean up partially created installation") } + return cause } logger.WithField("installation_id", installation.ID).Info("Installation created, waiting for stable state") @@ -314,8 +316,7 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use for { inst, err := s.CloudClient.GetInstallation(installation.ID, nil) if err != nil { - deleteInstallation("GetInstallation error during polling") - return nil, fmt.Errorf("failed to get installation status: %w", err) + return nil, cleanupCreatedInstallation(fmt.Errorf("failed to get installation status: %w", err)) } if inst.State == cloudModel.InstallationStateStable || inst.State == cloudModel.InstallationStateHibernating { @@ -324,31 +325,27 @@ func (s *Server) createCloudInstallation(ctx context.Context, name, version, use } if time.Now().After(timeout) { - deleteInstallation("timeout waiting for stable state") - return nil, fmt.Errorf("timeout waiting for installation to become stable") + return nil, cleanupCreatedInstallation(fmt.Errorf("timeout waiting for installation to become stable")) } // Context-aware sleep: wake immediately if a sibling goroutine failed and cancelled ctx. select { case <-ctx.Done(): - deleteInstallation("context cancelled during polling") - return nil, fmt.Errorf("installation wait cancelled: %w", ctx.Err()) + return nil, cleanupCreatedInstallation(fmt.Errorf("installation wait cancelled: %w", ctx.Err())) case <-time.After(30 * time.Second): } } // Check cancellation before the 10-minute initialization phase (DNS + ping + user setup). if err := ctx.Err(); err != nil { - deleteInstallation("context cancelled before initialization") - return nil, fmt.Errorf("installation creation cancelled before initialization: %w", err) + return nil, cleanupCreatedInstallation(fmt.Errorf("installation creation cancelled before initialization: %w", err)) } // Initialize Mattermost server with provided credentials spinwickURL := fmt.Sprintf("https://%s", cloudtools.GetInstallationDNSFromDNSRecords(installation)) err = s.initializeMattermostE2EServer(spinwickURL, username, password, logger) if err != nil { - deleteInstallation("initializeMattermostE2EServer failed") - return nil, fmt.Errorf("failed to initialize Mattermost server: %w", err) + return nil, cleanupCreatedInstallation(fmt.Errorf("failed to initialize Mattermost server: %w", err)) } return &E2EInstance{