diff --git a/server/config.go b/server/config.go index af5d187..88b894d 100644 --- a/server/config.go +++ b/server/config.go @@ -108,6 +108,7 @@ type MatterwickConfig struct { E2ELabel string E2EMobileIOSLabel string E2EMobileAndroidLabel string + E2EResetServersLabel string E2EUsername string E2EPassword string E2EServerVersion string @@ -116,6 +117,12 @@ type MatterwickConfig struct { E2EReleasePatternPrefix string E2ENightlyTriggerWorkflowName string // workflow name (name: field) of the nightly trigger workflow E2ETestWorkflowNames []string // workflow names of the actual test workflows (for completion-based cleanup) + // E2EInstanceMaxAge is the minimum age (in hours) a non-PR E2E instance must reach + // before the periodic orphan-cleanup scan will delete it. This prevents the scan + // from destroying instances that are still being used by a currently-running test. + // Set to the longest expected E2E run duration plus a small buffer. + // Default (0): 3 hours. + E2EInstanceMaxAge int } func findConfigFile(fileName string) string { diff --git a/server/e2e_tests.go b/server/e2e_tests.go index ec44b3f..8f3601c 100644 --- a/server/e2e_tests.go +++ b/server/e2e_tests.go @@ -34,9 +34,11 @@ type E2EInstance struct { ServerVersion string `json:"server_version"` } -// e2eUniqueSuffix returns an 8-character hex timestamp for instance name uniqueness. +// e2eUniqueSuffix returns an 8-character random hex suffix for instance name uniqueness. +// Uses cloudModel.NewID (crypto/rand-based UUID) truncated to 8 chars so that +// concurrent calls always produce distinct values regardless of clock resolution. func e2eUniqueSuffix() string { - return fmt.Sprintf("%08x", time.Now().Unix()) + return cloudModel.NewID()[:8] } // sanitizeForDNS lowercases and replaces non-DNS characters with hyphens. @@ -94,6 +96,33 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + // Snapshot the cleanup generation before provisioning begins. If handleE2ECleanup + // fires while the ~30 min creation is in flight, it increments this counter. + // We re-check below before storing instances so we never write stale entries. + s.e2ePRCleanupGenerationLock.Lock() + startGeneration := s.e2ePRCleanupGeneration[key] + s.e2ePRCleanupGenerationLock.Unlock() + + // Guard against duplicate webhook deliveries. The in-progress key includes + // the test platform so that a second mobile label with a *different* platform + // (e.g. E2E/Run-Android while E2E/Run-iOS is provisioning) is not incorrectly + // dropped — it will reuse the in-flight instances once they are stored, or + // create its own if they are not yet available. + inProgressKey := fmt.Sprintf("%s-%s", key, testPlatform) + s.e2eInProgressLock.Lock() + if s.e2eInProgress[inProgressKey] { + s.e2eInProgressLock.Unlock() + logger.Warn("E2E instance creation already in progress for this PR and platform, skipping duplicate request") + return + } + s.e2eInProgress[inProgressKey] = true + s.e2eInProgressLock.Unlock() + defer func() { + s.e2eInProgressLock.Lock() + delete(s.e2eInProgress, inProgressKey) + s.e2eInProgressLock.Unlock() + }() + // 1. Reuse existing in-memory instances (servers stay alive between label toggles). s.e2eInstancesLock.Lock() existingInstances := s.e2eInstances[key] @@ -152,6 +181,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) { return } + // Check whether E2EResetServersLabel was applied while provisioning was in flight. + // If the cleanup generation advanced, handleE2ECleanup already deleted the cloud + // installations; storing them here would put stale, deleted instances into the + // tracking map and dispatch a workflow against non-existent servers. + s.e2ePRCleanupGenerationLock.Lock() + resetDuringProvisioning := s.e2ePRCleanupGeneration[key] != startGeneration + s.e2ePRCleanupGenerationLock.Unlock() + if resetDuringProvisioning { + logger.Warn("E2E reset was requested during provisioning; discarding freshly created instances") + s.destroyE2EInstances(instances, logger) + return + } + s.e2eInstancesLock.Lock() s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() @@ -575,8 +617,18 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) { }) logger.Info("Handling E2E cleanup request") - // Fast path: in-memory map key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number) + + // Increment the cleanup generation counter *before* destroying anything. + // handleE2ETestRequest captures this value before provisioning starts and + // checks it again after the ~30 min creation window. If the counter advanced, + // provisioning discards its result instead of writing stale instances to the + // tracking map and dispatching a workflow against already-deleted servers. + s.e2ePRCleanupGenerationLock.Lock() + s.e2ePRCleanupGeneration[key]++ + s.e2ePRCleanupGenerationLock.Unlock() + + // Fast path: in-memory map s.e2eInstancesLock.Lock() instances := s.e2eInstances[key] delete(s.e2eInstances, key) @@ -639,21 +691,22 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru } } -// cleanupNonPRE2EInstancesOnStartup destroys stale non-PR E2E instances left over from a -// previous matterwick run. When matterwick restarts, the in-memory tracking map is wiped, -// making push/nightly/CMT instances untrackable — the workflow_run(completed) cleanup path -// will find nothing. A 8-hour age threshold is used so that instances still being actively -// used by tests that started before the restart are NOT destroyed mid-run. -// -// Reasoning: E2E tests (nightly, push, CMT) take at most a few hours. Any non-PR instance -// older than 8 hours whose tracking was lost must be orphaned. -// +// e2eInstanceMaxAge returns the configured maximum age for non-PR E2E instances before +// they are considered orphaned and eligible for deletion by the periodic cleanup scan. +// Falls back to 3 hours when the config value is 0 (unset). +func (s *Server) e2eInstanceMaxAge() time.Duration { + if s.Config.E2EInstanceMaxAge > 0 { + return time.Duration(s.Config.E2EInstanceMaxAge) * time.Hour + } + return 3 * time.Hour +} + // PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup -// on PR close, which includes DNS orphan cleanup, manages their lifecycle. -func (s *Server) cleanupNonPRE2EInstancesOnStartup() { - const maxAge = 8 * time.Hour - logger := s.Logger.WithField("type", "startup_e2e_cleanup") - logger.WithField("max_age_hours", 8).Info("Scanning for stale non-PR E2E instances from previous matterwick run") +// on PR close manages their lifecycle via cloud-API orphan scan. +func (s *Server) cleanupStaleNonPRE2EInstances() { + maxAge := s.e2eInstanceMaxAge() + logger := s.Logger.WithField("type", "periodic_e2e_cleanup") + logger.WithField("max_age_hours", maxAge.Hours()).Info("Scanning for stale non-PR E2E instances") cutoffMs := time.Now().Add(-maxAge).UnixMilli() @@ -664,7 +717,7 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { Paging: cloudModel.AllPagesNotDeleted(), }) if err != nil { - logger.WithError(err).Errorf("Failed to query %s instances on startup", instanceType) + logger.WithError(err).Errorf("Failed to query %s instances", instanceType) continue } @@ -672,7 +725,7 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { // Guard against a nil embedded Installation — must come first, all other // field accesses (OwnerID, CreateAt, State, ID) are on the embedded struct. if inst.Installation == nil { - logger.Warn("Skipping instance with nil Installation pointer in startup cleanup") + logger.Warn("Skipping instance with nil Installation pointer in cleanup scan") continue } @@ -682,13 +735,12 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { continue } - // Skip instances created within the last 8 hours — tests may still be running. - // If matterwick restarted mid-test, destroying active test instances would break them. + // Skip instances younger than maxAge — a test may still be using them. if inst.CreateAt > cutoffMs { logger.WithFields(logrus.Fields{ "installation_id": inst.ID, "owner_id": inst.OwnerID, - }).Debug("Skipping non-PR instance younger than 8h (may still be in use)") + }).Debug("Skipping non-PR instance younger than max age (may still be in use)") continue } @@ -707,14 +759,14 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() { "owner_id": inst.OwnerID, "state": inst.State, }) - instLogger.Warn("Destroying stale non-PR E2E instance (older than 8h) left from previous matterwick run") + instLogger.Warn("Destroying stale non-PR E2E instance") if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil { instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance") } } } - logger.Info("Startup non-PR E2E instance cleanup complete") + logger.Info("Non-PR E2E instance cleanup scan complete") } // resolveE2EServerVersion returns the Mattermost server version to use for E2E instances. @@ -1001,8 +1053,11 @@ func (s *Server) buildInstanceDetailsJSON(instances []*E2EInstance) (string, err return string(jsonBytes), nil } -// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API -func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType string, nightly bool) error { +// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API. +// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed +// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do +// a direct key lookup instead of fragile SHA suffix matching. +func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType, trackingKey string, nightly bool) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1035,6 +1090,9 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta "run_type": runType, "nightly": fmt.Sprintf("%t", nightly), } + if trackingKey != "" { + workflowInputs["mw_tracking_key"] = trackingKey + } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) req, err := client.NewRequest("POST", @@ -1063,8 +1121,11 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta return nil } -// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API -func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType string) error { +// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API. +// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed +// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do +// a direct key lookup instead of fragile SHA suffix matching. +func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType, trackingKey string) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) @@ -1082,6 +1143,9 @@ func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1U "PLATFORM": platform, "run_type": runType, } + if trackingKey != "" { + workflowInputs["mw_tracking_key"] = trackingKey + } // Use REST API to trigger workflow dispatch (v32 go-github compatibility) req, err := client.NewRequest("POST", diff --git a/server/pull_request.go b/server/pull_request.go index b638e4e..0f978e7 100644 --- a/server/pull_request.go +++ b/server/pull_request.go @@ -46,6 +46,12 @@ func (s *Server) handlePullRequestEvent(event *github.PullRequestEvent) { return } + if label == s.Config.E2EResetServersLabel { + logger.WithField("label", label).Info("PR received E2E reset-servers label, destroying existing servers") + go s.handleE2ECleanup(pr) + return + } + if s.isSpinWickLabel(label) { logger.WithField("label", label).Info("PR received SpinWick label") switch *event.Label.Name { @@ -160,14 +166,14 @@ func (s *Server) removeOldComments(comments []*github.IssueComment, pr *model.Pu } } -// isE2ELabel checks if a label is an E2E test label (desktop or mobile) +// isE2ELabel checks if a label is an E2E test label (desktop or mobile). func (s *Server) isE2ELabel(label string) bool { return label == s.Config.E2ELabel || label == s.Config.E2EMobileIOSLabel || label == s.Config.E2EMobileAndroidLabel } -// extractPlatformFromLabel determines the platform (ios/android/both) from the label +// extractPlatformFromLabel determines the platform (ios/android/both) from the label. func (s *Server) extractPlatformFromLabel(label string) string { switch label { case s.Config.E2EMobileIOSLabel: diff --git a/server/push_events.go b/server/push_events.go index 5e35099..af55513 100644 --- a/server/push_events.go +++ b/server/push_events.go @@ -99,6 +99,13 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers instanceType = "mobile" } + // Validate SHA before provisioning — an empty SHA would produce a malformed + // tracking key and send an empty MOBILE_VERSION to the test workflow. + if sha == "" { + logger.Error("Push event has no commit SHA, skipping E2E dispatch") + return + } + logger.WithField("instanceType", instanceType).Info("Creating E2E instances for push event") // Create instances based on repo type @@ -122,8 +129,9 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers s.e2eInstances[key] = instances s.e2eInstancesLock.Unlock() - // Trigger the appropriate E2E workflow - err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, instances) + // Trigger the appropriate E2E workflow, passing the tracking key so it reaches + // the workflow inputs as mw_tracking_key for reliable cleanup on completion. + err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, key, instances) if err != nil { logger.WithError(err).Error("Failed to trigger E2E workflow") // Remove from tracking and destroy instances on dispatch failure @@ -233,8 +241,10 @@ func getRunnerForPlatform(platform string) string { } } -// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event -func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha string, instances []*E2EInstance) error { +// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event. +// trackingKey is threaded through to the dispatch call so it appears as mw_tracking_key +// in the workflow inputs, enabling direct key-based cleanup on completion. +func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "instanceType": instanceType, @@ -250,14 +260,14 @@ func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, } if instanceType == "desktop" { - return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) + return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) } - return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances) + return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances) } // triggerDesktopE2EWorkflowForPushEvent triggers the desktop E2E workflow -func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { +func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -277,15 +287,11 @@ func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, bran runType = "RELEASE" } - // Dispatch to the exact commit SHA so the workflow_run completed event carries the - // same head_sha we stored in the tracking key ({repo}-push-{branch}-{sha}). - // Using the branch ref risks a head_sha mismatch if another commit lands during - // the ~30 min instance-creation window. - return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, sha, sha, instanceDetailsJSON, runType, false) + return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, branch, sha, instanceDetailsJSON, runType, trackingKey, false) } // triggerMobileE2EWorkflowForPushEvent triggers the mobile E2E workflow (e2e-detox-pr.yml) -func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error { +func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error { logger := s.Logger.WithFields(logrus.Fields{ "repo": repoName, "branch": branch, @@ -307,13 +313,11 @@ func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branc runType = "RELEASE" } - // Dispatch to the exact commit SHA (not branch) so the workflow_run completed event - // carries the same head_sha stored in the tracking key ({repo}-push-{branch}-{sha}). return s.dispatchMobileE2EWorkflow( - repoOwner, repoName, sha, sha, + repoOwner, repoName, branch, sha, instances[0].URL, instances[1].URL, instances[2].URL, "both", // Push events (release/master) test both platforms - runType, + runType, trackingKey, ) } diff --git a/server/server.go b/server/server.go index c950571..f1e5cdf 100644 --- a/server/server.go +++ b/server/server.go @@ -48,6 +48,25 @@ type Server struct { e2eInstances map[string][]*E2EInstance e2eInstancesLock sync.Mutex + // e2eInProgress guards against concurrent handleE2ETestRequest executions for the + // same PR+platform key (e.g. duplicate webhook deliveries). Only one goroutine per + // key may run the check-and-create flow at a time; a second arrival while the first + // is still running is silently dropped. + e2eInProgress map[string]bool + e2eInProgressLock sync.Mutex + + // e2ePRCleanupGeneration tracks how many times handleE2ECleanup has run for + // each PR key. handleE2ETestRequest captures the counter before provisioning + // and aborts if it has changed when provisioning completes, preventing stale + // instances from being stored after a concurrent reset. + e2ePRCleanupGeneration map[string]int64 + e2ePRCleanupGenerationLock sync.Mutex + + // stopCh is closed by Stop() to signal long-running background goroutines + // (e.g. the periodic E2E cleanup ticker) to exit cleanly. + stopCh chan struct{} + stopOnce sync.Once + // githubAPIBase overrides the GitHub API base URL (e.g. "https://api.github.com/"). // When non-empty (tests only), GitHub clients created inside this server will be // redirected to this URL instead of the real GitHub API. @@ -87,8 +106,11 @@ func New(config *MatterwickConfig) *Server { StartTime: time.Now(), Logger: logger.WithField("instance", cloudModel.NewID()), CloudClient: cloudClient, - envMaps: make(map[string]cloudModel.EnvVarMap), - e2eInstances: make(map[string][]*E2EInstance), + envMaps: make(map[string]cloudModel.EnvVarMap), + e2eInstances: make(map[string][]*E2EInstance), + e2eInProgress: make(map[string]bool), + e2ePRCleanupGeneration: make(map[string]int64), + stopCh: make(chan struct{}), } if !isAwsConfigDefined() { @@ -112,10 +134,28 @@ func New(config *MatterwickConfig) *Server { func (s *Server) Start() { s.Logger.Info("Starting MatterWick Server") - // Destroy any non-PR E2E instances left over from a previous run. - // Must run before the HTTP listener starts so that cleanup completes before - // new webhook events can arrive and re-create conflicting instances. - s.cleanupNonPRE2EInstancesOnStartup() + // Destroy stale non-PR E2E instances left from a previous run immediately on startup, + // then continue scanning periodically so a mid-run restart doesn't leave orphaned + // instances alive until the *next* matterwick restart. + // The scan interval is half the configured max-age so the worst-case orphan lifetime + // is maxAge + interval ≈ 1.5× maxAge. + s.cleanupStaleNonPRE2EInstances() + go func() { + interval := s.e2eInstanceMaxAge() / 2 + if interval < 30*time.Minute { + interval = 30 * time.Minute + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.cleanupStaleNonPRE2EInstances() + case <-s.stopCh: + return + } + } + }() s.initializeRouter() @@ -133,6 +173,7 @@ func (s *Server) Start() { // Stop stops a server func (s *Server) Stop() { s.Logger.Info("Stopping MatterWick") + s.stopOnce.Do(func() { close(s.stopCh) }) manners.Close() } diff --git a/server/workflow_run.go b/server/workflow_run.go index e0425ac..fc35b4a 100644 --- a/server/workflow_run.go +++ b/server/workflow_run.go @@ -129,9 +129,30 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay return } - // --- Test workflow completion: sha-based cleanup for push events and nightly runs --- + // --- Test workflow completion: clean up provisioned instances --- if payload.Action == "completed" && s.isE2ETestWorkflow(workflowName) { - logger.Info("Test workflow completed, checking for sha-based instance cleanup") + logger.Info("Test workflow completed, checking for instance cleanup") + + // Primary path: direct tracking-key lookup using the key embedded in dispatch + // inputs. Immune to SHA mismatch (new commits during ~30 min provisioning window) + // and to runID collisions (two runs sharing the same branch HEAD sha). + // Reading a nil Inputs map in Go returns "" safely — no nil check required. + if trackingKey := payload.WorkflowRun.Inputs["mw_tracking_key"]; trackingKey != "" { + s.e2eInstancesLock.Lock() + instances := s.e2eInstances[trackingKey] + delete(s.e2eInstances, trackingKey) + s.e2eInstancesLock.Unlock() + if len(instances) > 0 { + logger.WithField("tracking_key", trackingKey).Info("Destroying instances by tracking key") + s.destroyE2EInstances(instances, logger) + } else { + logger.WithField("tracking_key", trackingKey).Debug("No in-memory instances for tracking key (matterwick restarted or already cleaned)") + } + return + } + + // Fallback: SHA-based scan for runs dispatched before mw_tracking_key was added. + logger.Debug("No mw_tracking_key in workflow inputs, falling back to SHA-based instance cleanup") s.findAndDestroyInstancesBySHA(repoName, headSHA, logger) return } @@ -201,8 +222,10 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEv s.destroyE2EInstances(instances, logger) return } - // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, runType, nightly) + // Pass the tracking key so the workflow_run completed handler can clean up by + // direct key lookup rather than SHA suffix matching (immune to new commits during + // the ~30 min instance-creation window). + dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, branch, sha, instanceDetailsJSON, runType, key, nightly) } else { if len(instances) < 3 { logger.Errorf("Expected 3 mobile instances, got %d", len(instances)) @@ -212,9 +235,8 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEv s.destroyE2EInstances(instances, logger) return } - // Dispatch to the exact SHA so workflow_run completed event matches the tracking key. - dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, sha, sha, - instances[0].URL, instances[1].URL, instances[2].URL, "both", runType) + dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, branch, sha, + instances[0].URL, instances[1].URL, instances[2].URL, "both", runType, key) } if dispatchErr != nil { @@ -357,9 +379,9 @@ func (s *Server) handleCMTWithServerVersions(repoOwner, repoName, instanceType, return } - // Dispatch to the exact SHA so the completed workflow_run event carries the same - // head_sha, allowing findAndDestroyInstancesBySHA to match the tracking key. - if err := s.dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, runID, logger); err != nil { + // Pass the tracking key so the workflow_run completed handler can clean up by + // direct key lookup rather than SHA suffix matching. + if err := s.dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, key, runID, logger); err != nil { logger.WithError(err).Error("Failed to dispatch compatibility-matrix-testing.yml") s.e2eInstancesLock.Lock() delete(s.e2eInstances, key) @@ -472,17 +494,19 @@ func buildMobileCMTMatrixJSON(versions []string, instances []*E2EInstance) (stri } // dispatchCMTWorkflow dispatches compatibility-matrix-testing.yml with the populated -// CMT_MATRIX JSON. Dispatches with ref=sha so the resulting workflow_run.head_sha matches -// the tracking key suffix, enabling sha-based cleanup on completion. +// CMT_MATRIX JSON. trackingKey is the s.e2eInstances map key for this run; it is +// embedded as "mw_tracking_key" in the workflow inputs so the workflow_run completed +// handler can do a direct key lookup instead of fragile SHA suffix matching. // runID is the CMT provisioner workflow run ID, passed as cmt_run_id so the test workflow // can call back to Matterwick for instance cleanup. -func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType string, runID int64, logger logrus.FieldLogger) error { +func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrixJSON, instanceType, trackingKey string, runID int64, logger logrus.FieldLogger) error { ctx := context.Background() client := newGithubClient(s.Config.GithubAccessToken) workflowInputs := map[string]interface{}{ - "CMT_MATRIX": cmtMatrixJSON, - "cmt_run_id": fmt.Sprintf("%d", runID), + "CMT_MATRIX": cmtMatrixJSON, + "cmt_run_id": fmt.Sprintf("%d", runID), + "mw_tracking_key": trackingKey, } if instanceType == "desktop" { workflowInputs["DESKTOP_VERSION"] = branch @@ -491,14 +515,15 @@ func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrix } logger.WithFields(logrus.Fields{ - "ref": sha, + "ref": branch, "instanceType": instanceType, }).Debug("Dispatching compatibility-matrix-testing.yml") + // GitHub workflow_dispatch requires a branch or tag name as ref, not a commit SHA. req, err := client.NewRequest("POST", fmt.Sprintf("/repos/%s/%s/actions/workflows/compatibility-matrix-testing.yml/dispatches", repoOwner, repoName), map[string]interface{}{ - "ref": sha, + "ref": branch, "inputs": workflowInputs, }) if err != nil {