From eca75bd5656e7e228a4beb33b8ad98f1b4488f4b Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 15:55:36 +0530 Subject: [PATCH 1/7] Fix E2E instance creating throwing 409 when we add label --- server/config.go | 6 +++ server/e2e_tests.go | 93 ++++++++++++++++++++++++++++++------------ server/push_events.go | 31 +++++++------- server/server.go | 33 ++++++++++++--- server/workflow_run.go | 59 +++++++++++++++++++-------- 5 files changed, 157 insertions(+), 65 deletions(-) diff --git a/server/config.go b/server/config.go index af5d187..0dc62eb 100644 --- a/server/config.go +++ b/server/config.go @@ -116,6 +116,12 @@ type MatterwickConfig struct { E2EReleasePatternPrefix string E2ENightlyTriggerWorkflowName string // workflow name (name: field) of the nightly trigger workflow E2ETestWorkflowNames []string // workflow names of the actual test workflows (for completion-based cleanup) + // E2EInstanceMaxAge is the minimum age (in hours) a non-PR E2E instance must reach + // before the periodic orphan-cleanup scan will delete it. This prevents the scan + // from destroying instances that are still being used by a currently-running test. + // Set to the longest expected E2E run duration plus a small buffer. + // Default (0): 3 hours. + E2EInstanceMaxAge int } func findConfigFile(fileName string) string { diff --git a/server/e2e_tests.go b/server/e2e_tests.go index ec44b3f..61f8fd8 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -34,9 +34,12 @@ type E2EInstance struct { ServerVersion string `json:"server_version"` } -// e2eUniqueSuffix returns an 8-character hex timestamp for instance name uniqueness. +// e2eUniqueSuffix returns an 8-character hex suffix for instance name uniqueness. +// Uses the lower 32 bits of the nanosecond clock so that two calls within the +// same second produce different values, preventing 409 conflicts when duplicate +// webhook deliveries trigger concurrent instance creation. func e2eUniqueSuffix() string { - return fmt.Sprintf("%08x", time.Now().Unix()) + return fmt.Sprintf("%08x", uint32(time.Now().UnixNano())) } // sanitizeForDNS lowercases and replaces non-DNS characters with hyphens. @@ -94,6 +97,23 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + // Guard against duplicate webhook deliveries: if another goroutine is already + // handling this PR key (check, cloud lookup, or 30-min creation), drop the + // duplicate rather than racing to create conflicting cloud installations. + s.e2eInProgressLock.Lock() + if s.e2eInProgress[key] { + s.e2eInProgressLock.Unlock() + logger.Warn("E2E instance creation already in progress for this PR, skipping duplicate request") + return + } + s.e2eInProgress[key] = true + s.e2eInProgressLock.Unlock() + defer func() { + s.e2eInProgressLock.Lock() + delete(s.e2eInProgress, key) + s.e2eInProgressLock.Unlock() + }() + // 1. Reuse existing in-memory instances (servers stay alive between label toggles). s.e2eInstancesLock.Lock() existingInstances := s.e2eInstances[key] @@ -639,21 +659,33 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru } } -// cleanupNonPRE2EInstancesOnStartup destroys stale non-PR E2E instances left over from a -// previous matterwick run. When matterwick restarts, the in-memory tracking map is wiped, -// making push/nightly/CMT instances untrackable — the workflow_run(completed) cleanup path -// will find nothing. A 8-hour age threshold is used so that instances still being actively -// used by tests that started before the restart are NOT destroyed mid-run. +// e2eInstanceMaxAge returns the configured maximum age for non-PR E2E instances before +// they are considered orphaned and eligible for deletion by the periodic cleanup scan. +// Falls back to 3 hours when the config value is 0 (unset). +func (s *Server) e2eInstanceMaxAge() time.Duration { + if s.Config.E2EInstanceMaxAge > 0 { + return time.Duration(s.Config.E2EInstanceMaxAge) * time.Hour + } + return 3 * time.Hour +} + +// cleanupStaleNonPRE2EInstances destroys non-PR E2E instances (push/nightly/CMT) that +// are older than e2eInstanceMaxAge() and not already in a deletion state. +// +// Called on startup (to recover from a previous matterwick crash/restart that wiped the +// in-memory tracking map) and periodically during normal operation (so orphaned instances +// from a mid-run restart are cleaned up without waiting for the next matterwick restart). // -// Reasoning: E2E tests (nightly, push, CMT) take at most a few hours. Any non-PR instance -// older than 8 hours whose tracking was lost must be orphaned. +// The age threshold prevents destroying instances that are still actively used by a test +// that started before the restart. Set E2EInstanceMaxAge to a value larger than the +// longest expected E2E test run duration. // // PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup -// on PR close, which includes DNS orphan cleanup, manages their lifecycle. -func (s *Server) cleanupNonPRE2EInstancesOnStartup() { - const maxAge = 8 * time.Hour - logger := s.Logger.WithField("type", "startup_e2e_cleanup") - logger.WithField("max_age_hours", 8).Info("Scanning for stale non-PR E2E instances from previous matterwick run") +// on PR close manages their lifecycle via cloud-API orphan scan. +func (s *Server) cleanupStaleNonPRE2EInstances() { + maxAge := s.e2eInstanceMaxAge() + logger := s.Logger.WithField("type", "periodic_e2e_cleanup") + logger.WithField("max_age_hours", maxAge.Hours()).Info("Scanning for stale non-PR E2E instances") cutoffMs := time.Now().Add(-maxAge).UnixMilli() @@ -664,7 +696,7 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { Paging: cloudModel.AllPagesNotDeleted(), }) if err != nil { - logger.WithError(err).Errorf("Failed to query %s instances on startup", instanceType) + logger.WithError(err).Errorf("Failed to query %s instances", instanceType) continue } @@ -672,7 +704,7 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { // Guard against a nil embedded Installation — must come first, all other // field accesses (OwnerID, CreateAt, State, ID) are on the embedded struct. if inst.Installation == nil { - logger.Warn("Skipping instance with nil Installation pointer in startup cleanup") + logger.Warn("Skipping instance with nil Installation pointer in cleanup scan") continue } @@ -682,13 +714,12 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { continue } - // Skip instances created within the last 8 hours — tests may still be running. - // If matterwick restarted mid-test, destroying active test instances would break them. + // Skip instances younger than maxAge — a test may still be using them. if inst.CreateAt > cutoffMs { logger.WithFields(logrus.Fields{ "installation_id": inst.ID, "owner_id": inst.OwnerID, - }).Debug("Skipping non-PR instance younger than 8h (may still be in use)") + }).Debug("Skipping non-PR instance younger than max age (may still be in use)") continue } @@ -707,14 +738,14 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { "owner_id": inst.OwnerID, "state": inst.State, }) - instLogger.Warn("Destroying stale non-PR E2E instance (older than 8h) left from previous matterwick run") + instLogger.Warn("Destroying stale non-PR E2E instance") if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance") } } } - logger.Info("Startup non-PR E2E instance cleanup complete") + logger.Info("Non-PR E2E instance cleanup scan complete") } // resolveE2EServerVersion returns the Mattermost server version to use for E2E instances. @@ -1001,8 +1032,11 @@ func (s *Server) buildInstanceDetailsJSON(instances []*E2EInstance) (string, err return string(jsonBytes), nil } -// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API -func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType string, nightly bool) error { +// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API. +// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed +// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do +// a direct key lookup instead of fragile SHA suffix matching. +func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType, trackingKey string, nightly bool) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1035,6 +1069,9 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta "run_type": runType, "nightly": fmt.Sprintf("%t", nightly), } + if trackingKey != "" { + workflowInputs["mw_tracking_key"] = trackingKey + } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) req, err := client.NewRequest("POST", @@ -1063,8 +1100,11 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta return nil } -// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API -func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType string) error { +// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API. +// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed +// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do +// a direct key lookup instead of fragile SHA suffix matching. +func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType, trackingKey string) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1082,6 +1122,9 @@ func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1U "PLATFORM": platform, "run_type": runType, } + if trackingKey != "" { + workflowInputs["mw_tracking_key"] = trackingKey + } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) req, err := client.NewRequest("POST", diff --git a/server/push_events.go b/server/push_events.go index 5e35099..ad05c80 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -122,8 +122,9 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() - // Trigger the appropriate E2E workflow - err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, instances) + // Trigger the appropriate E2E workflow, passing the tracking key so it reaches + // the workflow inputs as mw_tracking_key for reliable cleanup on completion. + err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, key, instances) if err != nil { logger.WithError(err).Error("Failed to trigger E2E workflow") // Remove from tracking and destroy instances on dispatch failure @@ -233,8 +234,10 @@ func getRunnerForPlatform(platform string) string { } } -// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event -func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha string, instances []*E2EInstance) error { +// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event. +// trackingKey is threaded through to the dispatch call so it appears as mw_tracking_key +// in the workflow inputs, enabling direct key-based cleanup on completion. +func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "instanceType": instanceType, @@ -250,14 +253,14 @@ func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, } if instanceType == "desktop" { - return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) + return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) } - return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) + return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) } // triggerDesktopE2EWorkflowForPushEvent triggers the desktop E2E workflow -func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { +func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -277,15 +280,11 @@ func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, bran runType = "RELEASE" } - // Dispatch to the exact commit SHA so the workflow_run completed event carries the - // same head_sha we stored in the tracking key ({repo}-push-{branch}-{sha}). - // Using the branch ref risks a head_sha mismatch if another commit lands during - // the ~30 min instance-creation window. - return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, sha, sha, instanceDetailsJSON, runType, false) + return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, branch, sha, instanceDetailsJSON, runType, trackingKey, false) } // triggerMobileE2EWorkflowForPushEvent triggers the mobile E2E workflow (e2e-detox-pr.yml) -func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { +func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -307,13 +306,11 @@ func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branc runType = "RELEASE" } - // Dispatch to the exact commit SHA (not branch) so the workflow_run completed event - // carries the same head_sha stored in the tracking key ({repo}-push-{branch}-{sha}). return s.dispatchMobileE2EWorkflow( - repoOwner, repoName, sha, sha, + repoOwner, repoName, branch, sha, instances[0].URL, instances[1].URL, instances[2].URL, "both", // Push events (release/master) test both platforms - runType, + runType, trackingKey, ) } diff --git a/server/server.go b/server/server.go index c950571..c95202e 100644 --- a/server/server.go +++ b/server/server.go @@ -48,6 +48,13 @@ type Server struct { e2eInstances map[string][]*E2EInstance e2eInstancesLock sync.Mutex + // e2eInProgress guards against concurrent handleE2ETestRequest executions for the + // same PR key (e.g. duplicate webhook deliveries). Only one goroutine per key may + // run the check-and-create flow at a time; a second arrival while the first is + // still running is silently dropped. + e2eInProgress map[string]bool + e2eInProgressLock sync.Mutex + // githubAPIBase overrides the GitHub API base URL (e.g. "https://api.github.com/"). // When non-empty (tests only), GitHub clients created inside this server will be // redirected to this URL instead of the real GitHub API. @@ -87,8 +94,9 @@ func New(config *MatterwickConfig) *Server { StartTime: time.Now(), Logger: logger.WithField("instance", cloudModel.NewID()), CloudClient: cloudClient, - envMaps: make(map[string]cloudModel.EnvVarMap), - e2eInstances: make(map[string][]*E2EInstance), + envMaps: make(map[string]cloudModel.EnvVarMap), + e2eInstances: make(map[string][]*E2EInstance), + e2eInProgress: make(map[string]bool), } if !isAwsConfigDefined() { @@ -112,10 +120,23 @@ func New(config *MatterwickConfig) *Server { func (s *Server) Start() { s.Logger.Info("Starting MatterWick Server") - // Destroy any non-PR E2E instances left over from a previous run. - // Must run before the HTTP listener starts so that cleanup completes before - // new webhook events can arrive and re-create conflicting instances. - s.cleanupNonPRE2EInstancesOnStartup() + // Destroy stale non-PR E2E instances left from a previous run immediately on startup, + // then continue scanning periodically so a mid-run restart doesn't leave orphaned + // instances alive until the *next* matterwick restart. + // The scan interval is half the configured max-age so the worst-case orphan lifetime + // is maxAge + interval ≈ 1.5× maxAge. + s.cleanupStaleNonPRE2EInstances() + go func() { + interval := s.e2eInstanceMaxAge() / 2 + if interval < 30*time.Minute { + interval = 30 * time.Minute + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for range ticker.C { + s.cleanupStaleNonPRE2EInstances() + } + }() s.initializeRouter() diff --git a/server/workflow_run.go b/server/workflow_run.go index e0425ac..fc35b4a 100644 --- a/server/workflow_run.go +++ b/server/workflow_run.go @@ -129,9 +129,30 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay return } - // --- Test workflow completion: sha-based cleanup for push events and nightly runs --- + // --- Test workflow completion: clean up provisioned instances --- if payload.Action == "completed" && s.isE2ETestWorkflow(workflowName) { - logger.Info("Test workflow completed, checking for sha-based instance cleanup") + logger.Info("Test workflow completed, checking for instance cleanup") + + // Primary path: direct tracking-key lookup using the key embedded in dispatch + // inputs. Immune to SHA mismatch (new commits during ~30 min provisioning window) + // and to runID collisions (two runs sharing the same branch HEAD sha). + // Reading a nil Inputs map in Go returns "" safely — no nil check required. + if trackingKey := payload.WorkflowRun.Inputs["mw_tracking_key"]; trackingKey != "" { + s.e2eInstancesLock.Lock() + instances := s.e2eInstances[trackingKey] + delete(s.e2eInstances, trackingKey) + s.e2eInstancesLock.Unlock() + if len(instances) > 0 { + logger.WithField("tracking_key", trackingKey).Info("Destroying instances by tracking key") + s.destroyE2EInstances(instances, logger) + } else { + logger.WithField("tracking_key", trackingKey).Debug("No in-memory instances for tracking key (matterwick restarted or already cleaned)") + } + return + } + + // Fallback: SHA-based scan for runs dispatched before mw_tracking_key was added. + logger.Debug("No mw_tracking_key in workflow inputs, falling back to SHA-based instance cleanup") s.findAndDestroyInstancesBySHA(repoName, headSHA, logger) return } @@ -201,8 +222,10 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEv s.destroyE2EInstances(instances, logger) return } - // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, runType, nightly) + // Pass the tracking key so the workflow_run completed handler can clean up by + // direct key lookup rather than SHA suffix matching (immune to new commits during + // the ~30 min instance-creation window). + dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, branch, sha, instanceDetailsJSON, runType, key, nightly) } else { if len(instances) < 3 { logger.Errorf("Expected 3 mobile instances, got %d", len(instances)) @@ -212,9 +235,8 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEv s.destroyE2EInstances(instances, logger) return } - // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, sha, sha, - instances[0].URL, instances[1].URL, instances[2].URL, "both", runType) + dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, branch, sha, + instances[0].URL, instances[1].URL, instances[2].URL, "both", runType, key) } if dispatchErr != nil { @@ -357,9 +379,9 @@ func (s *Server) handleCMTWithServerVersions(repoOwner, repoName, instanceType, return } - // Dispatch to the exact SHA so the completed workflow_run event carries the same - // head_sha, allowing findAndDestroyInstancesBySHA to match the tracking key. - if err := s.dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, runID, logger); err != nil { + // Pass the tracking key so the workflow_run completed handler can clean up by + // direct key lookup rather than SHA suffix matching. + if err := s.dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, key, runID, logger); err != nil { logger.WithError(err).Error("Failed to dispatch compatibility-matrix-testing.yml") s.e2eInstancesLock.Lock() delete(s.e2eInstances, key) @@ -472,17 +494,19 @@ func buildMobileCMTMatrixJSON(versions []string, instances []*E2EInstance) (stri } // dispatchCMTWorkflow dispatches compatibility-matrix-testing.yml with the populated -// CMT_MATRIX JSON. Dispatches with ref=sha so the resulting workflow_run.head_sha matches -// the tracking key suffix, enabling sha-based cleanup on completion. +// CMT_MATRIX JSON. trackingKey is the s.e2eInstances map key for this run; it is +// embedded as "mw_tracking_key" in the workflow inputs so the workflow_run completed +// handler can do a direct key lookup instead of fragile SHA suffix matching. // runID is the CMT provisioner workflow run ID, passed as cmt_run_id so the test workflow // can call back to Matterwick for instance cleanup. -func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType string, runID int64, logger logrus.FieldLogger) error { +func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, trackingKey string, runID int64, logger logrus.FieldLogger) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) workflowInputs := map[string]interface{}{ - "CMT_MATRIX": cmtMatrixJSON, - "cmt_run_id": fmt.Sprintf("%d", runID), + "CMT_MATRIX": cmtMatrixJSON, + "cmt_run_id": fmt.Sprintf("%d", runID), + "mw_tracking_key": trackingKey, } if instanceType == "desktop" { workflowInputs["DESKTOP_VERSION"] = branch @@ -491,14 +515,15 @@ func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrix } logger.WithFields(logrus.Fields{ - "ref": sha, + "ref": branch, "instanceType": instanceType, }).Debug("Dispatching compatibility-matrix-testing.yml") + // GitHub workflow_dispatch requires a branch or tag name as ref, not a commit SHA. req, err := client.NewRequest("POST", fmt.Sprintf("/repos/%s/%s/actions/workflows/compatibility-matrix-testing.yml/dispatches", repoOwner, repoName), map[string]interface{}{ - "ref": sha, + "ref": branch, "inputs": workflowInputs, }) if err != nil { From 6302d49bab0512fed1b8a211b577ec8cbce390a5 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 18:17:50 +0530 Subject: [PATCH 2/7] fix(e2e): Add E2EResetServersLabel, periodic cleanup, tracking key for reliable instance lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix 409 on PR E2E label: per-PR in-progress guard + nanosecond uid prevents duplicate webhook deliveries from racing to create conflicting installations - Fix nightly/push/release E2E not running: dispatch ref was commit SHA; GitHub workflow_dispatch requires branch name — all non-PR dispatch paths updated - Fix cleanup gaps: embed mw_tracking_key in dispatch inputs for direct key-based cleanup on workflow_run completion, eliminating SHA mismatch failures - Add E2EResetServersLabel: single label to destroy existing PR servers so developer can add E2E/Run to get a clean-state environment - Add E2EInstanceMaxAge config (default 3h) and periodic orphan scan (every maxAge/2) replacing startup-only 8h hardcoded threshold Co-Authored-By: Claude Sonnet 4.6 --- server/config.go | 1 + server/e2e_tests.go | 11 ----------- server/pull_request.go | 10 ++++++++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/server/config.go b/server/config.go index 0dc62eb..88b894d 100644 --- a/server/config.go +++ b/server/config.go @@ -108,6 +108,7 @@ type MatterwickConfig struct { E2ELabel string E2EMobileIOSLabel string E2EMobileAndroidLabel string + E2EResetServersLabel string E2EUsername string E2EPassword string E2EServerVersion string diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 61f8fd8..a964672 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -669,17 +669,6 @@ func (s *Server) e2eInstanceMaxAge() time.Duration { return 3 * time.Hour } -// cleanupStaleNonPRE2EInstances destroys non-PR E2E instances (push/nightly/CMT) that -// are older than e2eInstanceMaxAge() and not already in a deletion state. -// -// Called on startup (to recover from a previous matterwick crash/restart that wiped the -// in-memory tracking map) and periodically during normal operation (so orphaned instances -// from a mid-run restart are cleaned up without waiting for the next matterwick restart). -// -// The age threshold prevents destroying instances that are still actively used by a test -// that started before the restart. Set E2EInstanceMaxAge to a value larger than the -// longest expected E2E test run duration. -// // PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup // on PR close manages their lifecycle via cloud-API orphan scan. func (s *Server) cleanupStaleNonPRE2EInstances() { diff --git a/server/pull_request.go b/server/pull_request.go index b638e4e..0f978e7 100644 --- a/server/pull_request.go +++ b/server/pull_request.go @@ -46,6 +46,12 @@ func (s *Server) handlePullRequestEvent(event *github.PullRequestEvent) { return } + if label == s.Config.E2EResetServersLabel { + logger.WithField("label", label).Info("PR received E2E reset-servers label, destroying existing servers") + go s.handleE2ECleanup(pr) + return + } + if s.isSpinWickLabel(label) { logger.WithField("label", label).Info("PR received SpinWick label") switch *event.Label.Name { @@ -160,14 +166,14 @@ func (s *Server) removeOldComments(comments []*github.IssueComment, pr *model.Pu } } -// isE2ELabel checks if a label is an E2E test label (desktop or mobile) +// isE2ELabel checks if a label is an E2E test label (desktop or mobile). func (s *Server) isE2ELabel(label string) bool { return label == s.Config.E2ELabel || label == s.Config.E2EMobileIOSLabel || label == s.Config.E2EMobileAndroidLabel } -// extractPlatformFromLabel determines the platform (ios/android/both) from the label +// extractPlatformFromLabel determines the platform (ios/android/both) from the label. func (s *Server) extractPlatformFromLabel(label string) string { switch label { case s.Config.E2EMobileIOSLabel: From a9fd9615cec928a646bc5096153c1700b436fca8 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 18:20:26 +0530 Subject: [PATCH 3/7] fix: apply CodeRabbit review fixes for PR #88 - e2eUniqueSuffix: replace uint32(UnixNano) with cloudModel.NewID()[:8] (crypto/rand-based) to eliminate ~4.3s wrap-around collision risk - in-progress guard: include testPlatform in key so different mobile platform labels (ios vs android) are not incorrectly dropped as duplicates - push_events: fail fast and destroy instances when push event SHA is empty to avoid malformed tracking keys and empty MOBILE_VERSION dispatch - server: add stopCh to Server struct; periodic cleanup goroutine now observes Stop() via select on stopCh, exiting cleanly on shutdown Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 26 ++++++++++++++------------ server/push_events.go | 9 +++++++++ server/server.go | 21 ++++++++++++++++----- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index a964672..c0fcce2 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -34,12 +34,11 @@ type E2EInstance struct { ServerVersion string `json:"server_version"` } -// e2eUniqueSuffix returns an 8-character hex suffix for instance name uniqueness. -// Uses the lower 32 bits of the nanosecond clock so that two calls within the -// same second produce different values, preventing 409 conflicts when duplicate -// webhook deliveries trigger concurrent instance creation. +// e2eUniqueSuffix returns an 8-character random hex suffix for instance name uniqueness. +// Uses cloudModel.NewID (crypto/rand-based UUID) truncated to 8 chars so that +// concurrent calls always produce distinct values regardless of clock resolution. func e2eUniqueSuffix() string { - return fmt.Sprintf("%08x", uint32(time.Now().UnixNano())) + return cloudModel.NewID()[:8] } // sanitizeForDNS lowercases and replaces non-DNS characters with hyphens. @@ -97,20 +96,23 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) - // Guard against duplicate webhook deliveries: if another goroutine is already - // handling this PR key (check, cloud lookup, or 30-min creation), drop the - // duplicate rather than racing to create conflicting cloud installations. + // Guard against duplicate webhook deliveries. The in-progress key includes + // the test platform so that a second mobile label with a *different* platform + // (e.g. E2E/Run-Android while E2E/Run-iOS is provisioning) is not incorrectly + // dropped — it will reuse the in-flight instances once they are stored, or + // create its own if they are not yet available. + inProgressKey := fmt.Sprintf("%s-%s", key, testPlatform) s.e2eInProgressLock.Lock() - if s.e2eInProgress[key] { + if s.e2eInProgress[inProgressKey] { s.e2eInProgressLock.Unlock() - logger.Warn("E2E instance creation already in progress for this PR, skipping duplicate request") + logger.Warn("E2E instance creation already in progress for this PR and platform, skipping duplicate request") return } - s.e2eInProgress[key] = true + s.e2eInProgress[inProgressKey] = true s.e2eInProgressLock.Unlock() defer func() { s.e2eInProgressLock.Lock() - delete(s.e2eInProgress, key) + delete(s.e2eInProgress, inProgressKey) s.e2eInProgressLock.Unlock() }() diff --git a/server/push_events.go b/server/push_events.go index ad05c80..17d612a 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -115,6 +115,15 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers logger.WithField("instanceCount", len(instances)).Info("E2E instances created successfully") + // Fail fast if the push event carried no commit SHA — an empty SHA would + // produce a malformed tracking key and send an empty MOBILE_VERSION to the + // test workflow. + if sha == "" { + logger.Error("Push event has no commit SHA, skipping E2E dispatch") + s.destroyE2EInstances(instances, logger) + return + } + // Track instances BEFORE dispatching so that a fast-completing workflow_run // completed event cannot race ahead of us and find nothing to clean up. key := fmt.Sprintf("%s-push-%s-%s", repoName, branch, sha) diff --git a/server/server.go b/server/server.go index c95202e..9b70269 100644 --- a/server/server.go +++ b/server/server.go @@ -49,12 +49,16 @@ type Server struct { e2eInstancesLock sync.Mutex // e2eInProgress guards against concurrent handleE2ETestRequest executions for the - // same PR key (e.g. duplicate webhook deliveries). Only one goroutine per key may - // run the check-and-create flow at a time; a second arrival while the first is - // still running is silently dropped. + // same PR+platform key (e.g. duplicate webhook deliveries). Only one goroutine per + // key may run the check-and-create flow at a time; a second arrival while the first + // is still running is silently dropped. e2eInProgress map[string]bool e2eInProgressLock sync.Mutex + // stopCh is closed by Stop() to signal long-running background goroutines + // (e.g. the periodic E2E cleanup ticker) to exit cleanly. + stopCh chan struct{} + // githubAPIBase overrides the GitHub API base URL (e.g. "https://api.github.com/"). // When non-empty (tests only), GitHub clients created inside this server will be // redirected to this URL instead of the real GitHub API. @@ -97,6 +101,7 @@ func New(config *MatterwickConfig) *Server { envMaps: make(map[string]cloudModel.EnvVarMap), e2eInstances: make(map[string][]*E2EInstance), e2eInProgress: make(map[string]bool), + stopCh: make(chan struct{}), } if !isAwsConfigDefined() { @@ -133,8 +138,13 @@ func (s *Server) Start() { } ticker := time.NewTicker(interval) defer ticker.Stop() - for range ticker.C { - s.cleanupStaleNonPRE2EInstances() + for { + select { + case <-ticker.C: + s.cleanupStaleNonPRE2EInstances() + case <-s.stopCh: + return + } } }() @@ -154,6 +164,7 @@ func (s *Server) Start() { // Stop stops a server func (s *Server) Stop() { s.Logger.Info("Stopping MatterWick") + close(s.stopCh) manners.Close() } From d83b8bccf58115691d4758b9d8f6b073525b585c Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 18:31:17 +0530 Subject: [PATCH 4/7] fix: guard handleE2ETestRequest against concurrent E2EResetServersLabel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a per-PR cleanup generation counter to prevent a race where handleE2ECleanup (triggered by E2EResetServersLabel) destroys cloud installations while handleE2ETestRequest is still provisioning, causing the provisioning goroutine to later store deleted instances in the tracking map and dispatch a workflow against non-existent servers. handleE2ECleanup increments e2ePRCleanupGeneration[key] before destroying. handleE2ETestRequest snapshots the counter before provisioning starts and checks it after creation completes — if the counter advanced, the fresh instances are discarded instead of being stored and dispatched. Also skips duplicate-comment findings (uid and in-progress guard) which were already fixed in the previous commit. Co-Authored-By: Claude Sonnet 4.6 --- server/e2e_tests.go | 32 +++++++++++++++++++++++++++++++- server/server.go | 16 ++++++++++++---- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index c0fcce2..8f3601c 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -96,6 +96,13 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + // Snapshot the cleanup generation before provisioning begins. If handleE2ECleanup + // fires while the ~30 min creation is in flight, it increments this counter. + // We re-check below before storing instances so we never write stale entries. + s.e2ePRCleanupGenerationLock.Lock() + startGeneration := s.e2ePRCleanupGeneration[key] + s.e2ePRCleanupGenerationLock.Unlock() + // Guard against duplicate webhook deliveries. The in-progress key includes // the test platform so that a second mobile label with a *different* platform // (e.g. E2E/Run-Android while E2E/Run-iOS is provisioning) is not incorrectly @@ -174,6 +181,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { return } + // Check whether E2EResetServersLabel was applied while provisioning was in flight. + // If the cleanup generation advanced, handleE2ECleanup already deleted the cloud + // installations; storing them here would put stale, deleted instances into the + // tracking map and dispatch a workflow against non-existent servers. + s.e2ePRCleanupGenerationLock.Lock() + resetDuringProvisioning := s.e2ePRCleanupGeneration[key] != startGeneration + s.e2ePRCleanupGenerationLock.Unlock() + if resetDuringProvisioning { + logger.Warn("E2E reset was requested during provisioning; discarding freshly created instances") + s.destroyE2EInstances(instances, logger) + return + } + s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() @@ -597,8 +617,18 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { }) logger.Info("Handling E2E cleanup request") - // Fast path: in-memory map key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + + // Increment the cleanup generation counter *before* destroying anything. + // handleE2ETestRequest captures this value before provisioning starts and + // checks it again after the ~30 min creation window. If the counter advanced, + // provisioning discards its result instead of writing stale instances to the + // tracking map and dispatching a workflow against already-deleted servers. + s.e2ePRCleanupGenerationLock.Lock() + s.e2ePRCleanupGeneration[key]++ + s.e2ePRCleanupGenerationLock.Unlock() + + // Fast path: in-memory map s.e2eInstancesLock.Lock() instances := s.e2eInstances[key] delete(s.e2eInstances, key) diff --git a/server/server.go b/server/server.go index 9b70269..f1a155a 100644 --- a/server/server.go +++ b/server/server.go @@ -55,6 +55,13 @@ type Server struct { e2eInProgress map[string]bool e2eInProgressLock sync.Mutex + // e2ePRCleanupGeneration tracks how many times handleE2ECleanup has run for + // each PR key. handleE2ETestRequest captures the counter before provisioning + // and aborts if it has changed when provisioning completes, preventing stale + // instances from being stored after a concurrent reset. + e2ePRCleanupGeneration map[string]int64 + e2ePRCleanupGenerationLock sync.Mutex + // stopCh is closed by Stop() to signal long-running background goroutines // (e.g. the periodic E2E cleanup ticker) to exit cleanly. stopCh chan struct{} @@ -98,10 +105,11 @@ func New(config *MatterwickConfig) *Server { StartTime: time.Now(), Logger: logger.WithField("instance", cloudModel.NewID()), CloudClient: cloudClient, - envMaps: make(map[string]cloudModel.EnvVarMap), - e2eInstances: make(map[string][]*E2EInstance), - e2eInProgress: make(map[string]bool), - stopCh: make(chan struct{}), + envMaps: make(map[string]cloudModel.EnvVarMap), + e2eInstances: make(map[string][]*E2EInstance), + e2eInProgress: make(map[string]bool), + e2ePRCleanupGeneration: make(map[string]int64), + stopCh: make(chan struct{}), } if !isAwsConfigDefined() { From c9d2779ddd98940d4be2c5dfdd3401583c0cac10 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 18:32:20 +0530 Subject: [PATCH 5/7] fix: move empty-SHA guard before instance provisioning in handlePushEventE2E Previously the check fired after createMultipleE2EInstancesForPushEvent, meaning 3 cloud installations were provisioned and then immediately destroyed when the push event carried no commit SHA. Move the guard to before the provisioning call so no instances are ever created for empty-SHA events, and remove the now-unnecessary destroyE2EInstances call. Co-Authored-By: Claude Sonnet 4.6 --- server/push_events.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/server/push_events.go b/server/push_events.go index 17d612a..af55513 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -99,6 +99,13 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers instanceType = "mobile" } + // Validate SHA before provisioning — an empty SHA would produce a malformed + // tracking key and send an empty MOBILE_VERSION to the test workflow. + if sha == "" { + logger.Error("Push event has no commit SHA, skipping E2E dispatch") + return + } + logger.WithField("instanceType", instanceType).Info("Creating E2E instances for push event") // Create instances based on repo type @@ -115,15 +122,6 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers logger.WithField("instanceCount", len(instances)).Info("E2E instances created successfully") - // Fail fast if the push event carried no commit SHA — an empty SHA would - // produce a malformed tracking key and send an empty MOBILE_VERSION to the - // test workflow. - if sha == "" { - logger.Error("Push event has no commit SHA, skipping E2E dispatch") - s.destroyE2EInstances(instances, logger) - return - } - // Track instances BEFORE dispatching so that a fast-completing workflow_run // completed event cannot race ahead of us and find nothing to clean up. key := fmt.Sprintf("%s-push-%s-%s", repoName, branch, sha) From 681094e7912364db1457aacb4d34ab0cf583dc11 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 20:04:56 +0530 Subject: [PATCH 6/7] fix: prune e2ePRCleanupGeneration on cleanup and guard Stop against double-close Delete the PR key from e2ePRCleanupGeneration at the end of handleE2ECleanup to prevent the map growing unbounded (one entry per cleaned-up PR forever). Guard Server.Stop against a second call by routing close(s.stopCh) through a sync.Once field, eliminating the panic on "close of closed channel". Co-Authored-By: Claude Sonnet 4.6 (1M context) --- server/e2e_tests.go | 7 +++++++ server/server.go | 5 +++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 8f3601c..5be0497 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -641,6 +641,13 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { // Fallback: catch orphans from restarts, map overwrites, or failed goroutines s.cleanupOrphanedE2EInstances(pr, logger) + + // Remove the generation counter for this key now that cleanup is complete. + // Leaving stale entries would cause e2ePRCleanupGeneration to grow without + // bound across the lifetime of the server (one entry per cleaned-up PR). + s.e2ePRCleanupGenerationLock.Lock() + delete(s.e2ePRCleanupGeneration, key) + s.e2ePRCleanupGenerationLock.Unlock() } // cleanupOrphanedE2EInstances queries the cloud API by DNS LIKE pattern and destroys any matches. diff --git a/server/server.go b/server/server.go index f1a155a..f1e5cdf 100644 --- a/server/server.go +++ b/server/server.go @@ -64,7 +64,8 @@ type Server struct { // stopCh is closed by Stop() to signal long-running background goroutines // (e.g. the periodic E2E cleanup ticker) to exit cleanly. - stopCh chan struct{} + stopCh chan struct{} + stopOnce sync.Once // githubAPIBase overrides the GitHub API base URL (e.g. "https://api.github.com/"). // When non-empty (tests only), GitHub clients created inside this server will be @@ -172,7 +173,7 @@ func (s *Server) Start() { // Stop stops a server func (s *Server) Stop() { s.Logger.Info("Stopping MatterWick") - close(s.stopCh) + s.stopOnce.Do(func() { close(s.stopCh) }) manners.Close() } From 0b712aa0b386f6ad8b4947b23a0206d6aee79107 Mon Sep 17 00:00:00 2001 From: yasserfaraazkhan Date: Thu, 16 Apr 2026 20:12:53 +0530 Subject: [PATCH 7/7] fix: revert e2ePRCleanupGeneration deletion to preserve monotonic semantics Deleting the key after cleanup reintroduces the zero-value ambiguity: a provisioning run that started before any cleanup captures startGeneration=0, then cleanup increments to 1 and deletes the key (value silently back to 0), so the final check 0!=0 is false and stale deleted instances get stored. Keep the counter monotonic (never delete) so the != comparison always fires when cleanup ran during provisioning. The map holds at most one int64 per PR key ever cleaned up, which is negligible. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- server/e2e_tests.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/server/e2e_tests.go b/server/e2e_tests.go index 5be0497..8f3601c 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -641,13 +641,6 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { // Fallback: catch orphans from restarts, map overwrites, or failed goroutines s.cleanupOrphanedE2EInstances(pr, logger) - - // Remove the generation counter for this key now that cleanup is complete. - // Leaving stale entries would cause e2ePRCleanupGeneration to grow without - // bound across the lifetime of the server (one entry per cleaned-up PR). - s.e2ePRCleanupGenerationLock.Lock() - delete(s.e2ePRCleanupGeneration, key) - s.e2ePRCleanupGenerationLock.Unlock() } // cleanupOrphanedE2EInstances queries the cloud API by DNS LIKE pattern and destroys any matches.