Skip to content
7 changes: 7 additions & 0 deletions server/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ type MatterwickConfig struct {
E2ELabel string
E2EMobileIOSLabel string
E2EMobileAndroidLabel string
E2EResetServersLabel string
E2EUsername string
E2EPassword string
E2EServerVersion string
Expand All @@ -116,6 +117,12 @@ type MatterwickConfig struct {
E2EReleasePatternPrefix string
E2ENightlyTriggerWorkflowName string // workflow name (name: field) of the nightly trigger workflow
E2ETestWorkflowNames []string // workflow names of the actual test workflows (for completion-based cleanup)
// E2EInstanceMaxAge is the minimum age (in hours) a non-PR E2E instance must reach
// before the periodic orphan-cleanup scan will delete it. This prevents the scan
// from destroying instances that are still being used by a currently-running test.
// Set to the longest expected E2E run duration plus a small buffer.
// Default (0): 3 hours.
E2EInstanceMaxAge int
}

func findConfigFile(fileName string) string {
Expand Down
120 changes: 92 additions & 28 deletions server/e2e_tests.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@ type E2EInstance struct {
ServerVersion string `json:"server_version"`
}

// e2eUniqueSuffix returns an 8-character hex timestamp for instance name uniqueness.
// e2eUniqueSuffix returns an 8-character random hex suffix for instance name uniqueness.
// Uses cloudModel.NewID (crypto/rand-based UUID) truncated to 8 chars so that
// concurrent calls always produce distinct values regardless of clock resolution.
func e2eUniqueSuffix() string {
return fmt.Sprintf("%08x", time.Now().Unix())
return cloudModel.NewID()[:8]
}

// sanitizeForDNS lowercases and replaces non-DNS characters with hyphens.
Expand Down Expand Up @@ -94,6 +96,33 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {

key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number)

// Snapshot the cleanup generation before provisioning begins. If handleE2ECleanup
// fires while the ~30 min creation is in flight, it increments this counter.
// We re-check below before storing instances so we never write stale entries.
s.e2ePRCleanupGenerationLock.Lock()
startGeneration := s.e2ePRCleanupGeneration[key]
s.e2ePRCleanupGenerationLock.Unlock()

// Guard against duplicate webhook deliveries. The in-progress key includes
// the test platform so that a second mobile label with a *different* platform
// (e.g. E2E/Run-Android while E2E/Run-iOS is provisioning) is not incorrectly
// dropped — it will reuse the in-flight instances once they are stored, or
// create its own if they are not yet available.
inProgressKey := fmt.Sprintf("%s-%s", key, testPlatform)
s.e2eInProgressLock.Lock()
if s.e2eInProgress[inProgressKey] {
s.e2eInProgressLock.Unlock()
logger.Warn("E2E instance creation already in progress for this PR and platform, skipping duplicate request")
return
}
s.e2eInProgress[inProgressKey] = true
s.e2eInProgressLock.Unlock()
defer func() {
s.e2eInProgressLock.Lock()
delete(s.e2eInProgress, inProgressKey)
s.e2eInProgressLock.Unlock()
}()

// 1. Reuse existing in-memory instances (servers stay alive between label toggles).
s.e2eInstancesLock.Lock()
existingInstances := s.e2eInstances[key]
Expand Down Expand Up @@ -152,6 +181,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {
return
}

// Check whether E2EResetServersLabel was applied while provisioning was in flight.
// If the cleanup generation advanced, handleE2ECleanup already deleted the cloud
// installations; storing them here would put stale, deleted instances into the
// tracking map and dispatch a workflow against non-existent servers.
s.e2ePRCleanupGenerationLock.Lock()
resetDuringProvisioning := s.e2ePRCleanupGeneration[key] != startGeneration
s.e2ePRCleanupGenerationLock.Unlock()
if resetDuringProvisioning {
logger.Warn("E2E reset was requested during provisioning; discarding freshly created instances")
s.destroyE2EInstances(instances, logger)
return
}

s.e2eInstancesLock.Lock()
s.e2eInstances[key] = instances
s.e2eInstancesLock.Unlock()
Expand Down Expand Up @@ -575,8 +617,18 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) {
})
logger.Info("Handling E2E cleanup request")

// Fast path: in-memory map
key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number)

// Increment the cleanup generation counter *before* destroying anything.
// handleE2ETestRequest captures this value before provisioning starts and
// checks it again after the ~30 min creation window. If the counter advanced,
// provisioning discards its result instead of writing stale instances to the
// tracking map and dispatching a workflow against already-deleted servers.
s.e2ePRCleanupGenerationLock.Lock()
s.e2ePRCleanupGeneration[key]++
s.e2ePRCleanupGenerationLock.Unlock()

// Fast path: in-memory map
s.e2eInstancesLock.Lock()
instances := s.e2eInstances[key]
delete(s.e2eInstances, key)
Expand Down Expand Up @@ -639,21 +691,22 @@ func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logru
}
}

// cleanupNonPRE2EInstancesOnStartup destroys stale non-PR E2E instances left over from a
// previous matterwick run. When matterwick restarts, the in-memory tracking map is wiped,
// making push/nightly/CMT instances untrackable — the workflow_run(completed) cleanup path
// will find nothing. An 8-hour age threshold is used so that instances still being actively
// used by tests that started before the restart are NOT destroyed mid-run.
//
// Reasoning: E2E tests (nightly, push, CMT) take at most a few hours. Any non-PR instance
// older than 8 hours whose tracking was lost must be orphaned.
//
// e2eInstanceMaxAge returns the configured maximum age for non-PR E2E instances before
// they are considered orphaned and eligible for deletion by the periodic cleanup scan.
// Falls back to 3 hours when the config value is 0 (unset).
func (s *Server) e2eInstanceMaxAge() time.Duration {
	configuredHours := s.Config.E2EInstanceMaxAge
	if configuredHours <= 0 {
		// Config value unset (zero) or invalid; use the 3-hour default.
		return 3 * time.Hour
	}
	return time.Duration(configuredHours) * time.Hour
}

// PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup
// on PR close, which includes DNS orphan cleanup, manages their lifecycle.
func (s *Server) cleanupNonPRE2EInstancesOnStartup() {
const maxAge = 8 * time.Hour
logger := s.Logger.WithField("type", "startup_e2e_cleanup")
logger.WithField("max_age_hours", 8).Info("Scanning for stale non-PR E2E instances from previous matterwick run")
// on PR close manages their lifecycle via cloud-API orphan scan.
func (s *Server) cleanupStaleNonPRE2EInstances() {
maxAge := s.e2eInstanceMaxAge()
logger := s.Logger.WithField("type", "periodic_e2e_cleanup")
logger.WithField("max_age_hours", maxAge.Hours()).Info("Scanning for stale non-PR E2E instances")

cutoffMs := time.Now().Add(-maxAge).UnixMilli()

Expand All @@ -664,15 +717,15 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() {
Paging: cloudModel.AllPagesNotDeleted(),
})
if err != nil {
logger.WithError(err).Errorf("Failed to query %s instances on startup", instanceType)
logger.WithError(err).Errorf("Failed to query %s instances", instanceType)
continue
}

for _, inst := range installations {
// Guard against a nil embedded Installation — must come first, all other
// field accesses (OwnerID, CreateAt, State, ID) are on the embedded struct.
if inst.Installation == nil {
logger.Warn("Skipping instance with nil Installation pointer in startup cleanup")
logger.Warn("Skipping instance with nil Installation pointer in cleanup scan")
continue
}

Expand All @@ -682,13 +735,12 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() {
continue
}

// Skip instances created within the last 8 hours — tests may still be running.
// If matterwick restarted mid-test, destroying active test instances would break them.
// Skip instances younger than maxAge — a test may still be using them.
if inst.CreateAt > cutoffMs {
logger.WithFields(logrus.Fields{
"installation_id": inst.ID,
"owner_id": inst.OwnerID,
}).Debug("Skipping non-PR instance younger than 8h (may still be in use)")
}).Debug("Skipping non-PR instance younger than max age (may still be in use)")
continue
}

Expand All @@ -707,14 +759,14 @@ func (s *Server) cleanupNonPRE2EInstancesOnStartup() {
"owner_id": inst.OwnerID,
"state": inst.State,
})
instLogger.Warn("Destroying stale non-PR E2E instance (older than 8h) left from previous matterwick run")
instLogger.Warn("Destroying stale non-PR E2E instance")
if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil {
instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance")
}
}
}

logger.Info("Startup non-PR E2E instance cleanup complete")
logger.Info("Non-PR E2E instance cleanup scan complete")
}

// resolveE2EServerVersion returns the Mattermost server version to use for E2E instances.
Expand Down Expand Up @@ -1001,8 +1053,11 @@ func (s *Server) buildInstanceDetailsJSON(instances []*E2EInstance) (string, err
return string(jsonBytes), nil
}

// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API
func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType string, nightly bool) error {
// dispatchDesktopE2EWorkflow triggers the desktop E2E workflow via GitHub Actions API.
// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed
// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do
// a direct key lookup instead of fragile SHA suffix matching.
func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, instanceDetailsJSON, runType, trackingKey string, nightly bool) error {
ctx := context.Background()
client := newGithubClient(s.Config.GithubAccessToken)

Expand Down Expand Up @@ -1035,6 +1090,9 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta
"run_type": runType,
"nightly": fmt.Sprintf("%t", nightly),
}
if trackingKey != "" {
workflowInputs["mw_tracking_key"] = trackingKey
}

// Use REST API to trigger workflow dispatch (v32 go-github compatibility)
req, err := client.NewRequest("POST",
Expand Down Expand Up @@ -1063,8 +1121,11 @@ func (s *Server) dispatchDesktopE2EWorkflow(repoOwner, repoName, ref, sha, insta
return nil
}

// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API
func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType string) error {
// dispatchMobileE2EWorkflow triggers the mobile E2E workflow via GitHub Actions API.
// trackingKey is the s.e2eInstances map key for this run; when non-empty it is passed
// as the "mw_tracking_key" workflow input so the workflow_run completed handler can do
// a direct key lookup instead of fragile SHA suffix matching.
func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1URL, site2URL, site3URL, platform, runType, trackingKey string) error {
ctx := context.Background()
client := newGithubClient(s.Config.GithubAccessToken)

Expand All @@ -1082,6 +1143,9 @@ func (s *Server) dispatchMobileE2EWorkflow(repoOwner, repoName, ref, sha, site1U
"PLATFORM": platform,
"run_type": runType,
}
if trackingKey != "" {
workflowInputs["mw_tracking_key"] = trackingKey
}

// Use REST API to trigger workflow dispatch (v32 go-github compatibility)
req, err := client.NewRequest("POST",
Expand Down
10 changes: 8 additions & 2 deletions server/pull_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ func (s *Server) handlePullRequestEvent(event *github.PullRequestEvent) {
return
}

if label == s.Config.E2EResetServersLabel {
logger.WithField("label", label).Info("PR received E2E reset-servers label, destroying existing servers")
go s.handleE2ECleanup(pr)
return
Comment thread
yasserfaraazkhan marked this conversation as resolved.
}

if s.isSpinWickLabel(label) {
logger.WithField("label", label).Info("PR received SpinWick label")
switch *event.Label.Name {
Expand Down Expand Up @@ -160,14 +166,14 @@ func (s *Server) removeOldComments(comments []*github.IssueComment, pr *model.Pu
}
}

// isE2ELabel checks if a label is an E2E test label (desktop or mobile)
// isE2ELabel checks if a label is an E2E test label (desktop or mobile).
func (s *Server) isE2ELabel(label string) bool {
	switch label {
	case s.Config.E2ELabel, s.Config.E2EMobileIOSLabel, s.Config.E2EMobileAndroidLabel:
		return true
	default:
		return false
	}
}

// extractPlatformFromLabel determines the platform (ios/android/both) from the label
// extractPlatformFromLabel determines the platform (ios/android/both) from the label.
func (s *Server) extractPlatformFromLabel(label string) string {
switch label {
case s.Config.E2EMobileIOSLabel:
Expand Down
38 changes: 21 additions & 17 deletions server/push_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers
instanceType = "mobile"
}

// Validate SHA before provisioning — an empty SHA would produce a malformed
// tracking key and send an empty MOBILE_VERSION to the test workflow.
if sha == "" {
logger.Error("Push event has no commit SHA, skipping E2E dispatch")
return
}

logger.WithField("instanceType", instanceType).Info("Creating E2E instances for push event")

// Create instances based on repo type
Expand All @@ -122,8 +129,9 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers
s.e2eInstances[key] = instances
s.e2eInstancesLock.Unlock()

// Trigger the appropriate E2E workflow
err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, instances)
// Trigger the appropriate E2E workflow, passing the tracking key so it reaches
// the workflow inputs as mw_tracking_key for reliable cleanup on completion.
err = s.triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, key, instances)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
if err != nil {
logger.WithError(err).Error("Failed to trigger E2E workflow")
// Remove from tracking and destroy instances on dispatch failure
Expand Down Expand Up @@ -233,8 +241,10 @@ func getRunnerForPlatform(platform string) string {
}
}

// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event
func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha string, instances []*E2EInstance) error {
// triggerE2EWorkflowForPushEvent triggers the E2E workflow for a push event.
// trackingKey is threaded through to the dispatch call so it appears as mw_tracking_key
// in the workflow inputs, enabling direct key-based cleanup on completion.
func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch, sha, trackingKey string, instances []*E2EInstance) error {
logger := s.Logger.WithFields(logrus.Fields{
"repo": repoName,
"instanceType": instanceType,
Expand All @@ -250,14 +260,14 @@ func (s *Server) triggerE2EWorkflowForPushEvent(repoName, instanceType, branch,
}

if instanceType == "desktop" {
return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances)
return s.triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances)
}

return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, instances)
return s.triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey, instances)
}

// triggerDesktopE2EWorkflowForPushEvent triggers the desktop E2E workflow
func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error {
func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error {
logger := s.Logger.WithFields(logrus.Fields{
"repo": repoName,
"branch": branch,
Expand All @@ -277,15 +287,11 @@ func (s *Server) triggerDesktopE2EWorkflowForPushEvent(repoOwner, repoName, bran
runType = "RELEASE"
}

// Dispatch to the exact commit SHA so the workflow_run completed event carries the
// same head_sha we stored in the tracking key ({repo}-push-{branch}-{sha}).
// Using the branch ref risks a head_sha mismatch if another commit lands during
// the ~30 min instance-creation window.
return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, sha, sha, instanceDetailsJSON, runType, false)
return s.dispatchDesktopE2EWorkflow(repoOwner, repoName, branch, sha, instanceDetailsJSON, runType, trackingKey, false)
}

// triggerMobileE2EWorkflowForPushEvent triggers the mobile E2E workflow (e2e-detox-pr.yml)
func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha string, instances []*E2EInstance) error {
func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branch, sha, trackingKey string, instances []*E2EInstance) error {
logger := s.Logger.WithFields(logrus.Fields{
"repo": repoName,
"branch": branch,
Expand All @@ -307,13 +313,11 @@ func (s *Server) triggerMobileE2EWorkflowForPushEvent(repoOwner, repoName, branc
runType = "RELEASE"
}

// Dispatch to the exact commit SHA (not branch) so the workflow_run completed event
// carries the same head_sha stored in the tracking key ({repo}-push-{branch}-{sha}).
return s.dispatchMobileE2EWorkflow(
repoOwner, repoName, sha, sha,
repoOwner, repoName, branch, sha,
instances[0].URL, instances[1].URL, instances[2].URL,
"both", // Push events (release/master) test both platforms
runType,
runType, trackingKey,
)
}

Loading
Loading