diff --git a/server/e2e_tests.go b/server/e2e_tests.go
index 58eb08d..6db2d0f 100644
--- a/server/e2e_tests.go
+++ b/server/e2e_tests.go
@@ -10,6 +10,7 @@ import (
 	"net/url"
 	"os"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/google/go-github/v32/github"
@@ -71,43 +72,19 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {
 	})
 	logger.Info("Handling E2E test request")
 
-	// Check if there's already an E2E run in progress for this PR
-	// If yes, cancel it before starting a new one
-	key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number)
-	s.e2eInstancesLock.Lock()
-	existingInstances, hasExisting := s.e2eInstances[key]
-	if hasExisting {
-		logger.WithField("existingInstances", len(existingInstances)).Info("Found existing E2E run, canceling it")
-		// Remove from tracking immediately to prevent race conditions
-		delete(s.e2eInstances, key)
-		s.e2eInstancesLock.Unlock()
-
-		// Destroy old instances in background
-		go s.destroyE2EInstances(existingInstances, logger)
-
-		// Also attempt to cancel the GitHub workflow run
-		go s.cancelPRWorkflowRuns(pr, logger)
-	} else {
-		s.e2eInstancesLock.Unlock()
-	}
-
-	// Determine instance type based on repository
+	// Determine instance type and platforms first — needed for both reuse lookup and creation.
 	var instanceType string
 	var platforms []string
 	var testPlatform string // For mobile: which OS to test (ios/android/both). For desktop: unused (tests all OS platforms)
 	if strings.Contains(pr.RepoName, "desktop") {
 		instanceType = "desktop"
-		// Desktop: platforms = OS runners (linux/macos/windows)
-		// Desktop tests run on all OS platforms automatically
 		platforms = []string{"linux", "macos", "windows"}
-		testPlatform = "all" // Desktop always tests all OS platforms (linux/macos/windows)
+		testPlatform = "all"
 	} else if strings.Contains(pr.RepoName, "mobile") {
 		instanceType = "mobile"
-		// Mobile: platforms = server instances (site-1/site-2/site-3)
 		// Always create all 3 mobile instances (workflow expects SITE_1/2/3_URL).
 		platforms = []string{"site-1", "site-2", "site-3"}
-		// Mobile: testPlatform = which mobile OS to test (ios/android/both)
 		testPlatform = s.extractPlatformFromLabel(label)
 		logger.WithField("testPlatform", testPlatform).Info("Detected mobile test platform from label (ios/android/both)")
 	} else {
@@ -115,7 +92,40 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {
 		return
 	}
 
-	// Create multiple instances
+	key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number)
+
+	// 1. Reuse existing in-memory instances (servers stay alive between label toggles).
+	s.e2eInstancesLock.Lock()
+	existingInstances := s.e2eInstances[key]
+	s.e2eInstancesLock.Unlock()
+
+	if len(existingInstances) > 0 {
+		logger.WithField("instances", len(existingInstances)).Info("Reusing existing in-memory E2E instances")
+		s.cancelPRWorkflowRuns(pr, logger)
+		s.wakeUpHibernatingInstances(existingInstances, logger)
+		if err := s.triggerE2EWorkflow(pr, existingInstances, instanceType, testPlatform); err != nil {
+			logger.WithError(err).Error("Failed to trigger E2E workflow with existing instances")
+			s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err))
+		}
+		return
+	}
+
+	// 2. Check cloud API for instances that survived a matterwick restart.
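As a standalone illustration of the step-1/step-3 shape above, here is a minimal sketch of a mutex-guarded tracking map with reuse-before-create. The `tracker` and `Instance` types and the `getOrCreate` helper are hypothetical, not part of matterwick:

```go
package main

import (
	"fmt"
	"sync"
)

// Instance and tracker are hypothetical stand-ins for matterwick's E2EInstance and
// the Server's e2eInstances map; they only illustrate the shape of steps 1 and 3.
type Instance struct{ Name string }

type tracker struct {
	mu        sync.Mutex
	instances map[string][]*Instance
}

// getOrCreate reuses tracked instances when present, otherwise creates and stores them.
// Step 2 of the handler (the cloud API lookup) is omitted here. Note that, like the
// handler, the lock is not held across create, so concurrent calls for one key race.
func (t *tracker) getOrCreate(key string, create func() ([]*Instance, error)) ([]*Instance, error) {
	t.mu.Lock()
	existing := t.instances[key]
	t.mu.Unlock()
	if len(existing) > 0 {
		return existing, nil // step 1: reuse
	}
	fresh, err := create() // step 3: create fresh instances
	if err != nil {
		return nil, err
	}
	t.mu.Lock()
	t.instances[key] = fresh
	t.mu.Unlock()
	return fresh, nil
}

func main() {
	t := &tracker{instances: map[string][]*Instance{}}
	insts, _ := t.getOrCreate("mattermost-mobile-pr-123", func() ([]*Instance, error) {
		return []*Instance{{Name: "site-1"}, {Name: "site-2"}, {Name: "site-3"}}, nil
	})
	fmt.Println(len(insts)) // 3
}
```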
+	if cloudInstances, err := s.findExistingE2EInstancesInCloud(pr, instanceType, platforms); err == nil && len(cloudInstances) == len(platforms) {
+		logger.WithField("instances", len(cloudInstances)).Info("Reusing existing cloud E2E instances")
+		s.cancelPRWorkflowRuns(pr, logger)
+		s.wakeUpHibernatingInstances(cloudInstances, logger)
+		s.e2eInstancesLock.Lock()
+		s.e2eInstances[key] = cloudInstances
+		s.e2eInstancesLock.Unlock()
+		if err := s.triggerE2EWorkflow(pr, cloudInstances, instanceType, testPlatform); err != nil {
+			logger.WithError(err).Error("Failed to trigger E2E workflow with cloud instances")
+			s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err))
+		}
+		return
+	}
+
+	// 3. No existing instances — create fresh ones.
 	instances, err := s.createMultipleE2EInstances(pr, instanceType, platforms)
 	if err != nil {
 		logger.WithError(err).Error("Failed to create E2E instances")
@@ -129,23 +139,32 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {
 		return
 	}
 
-	// Store instances for later cleanup (reuse key variable from above)
+	// Instance creation takes ~30 min. Check if the PR was closed during that window.
+	// If so, destroy the freshly created instances — no further cleanup events will fire
+	// for a closed PR, so storing them would leak them permanently.
+	prInfo, _, prErr := newGithubClient(s.Config.GithubAccessToken).PullRequests.Get(
+		context.Background(), pr.RepoOwner, pr.RepoName, pr.Number)
+	if prErr != nil {
+		logger.WithError(prErr).Warn("Failed to check PR state after instance creation; proceeding")
+	} else if prInfo.GetState() == "closed" {
+		logger.Warn("PR was closed during E2E instance creation; destroying instances without tracking")
+		s.destroyE2EInstances(instances, logger)
+		return
+	}
+
 	s.e2eInstancesLock.Lock()
 	s.e2eInstances[key] = instances
 	s.e2eInstancesLock.Unlock()
 
 	logger.WithField("instances", len(instances)).Info("Successfully created E2E instances")
 
-	// Trigger the appropriate workflow
-	err = s.triggerE2EWorkflow(pr, instances, instanceType, testPlatform) // Pass testPlatform: "all" for desktop, "ios"/"android"/"both" for mobile
-	if err != nil {
+	if err = s.triggerE2EWorkflow(pr, instances, instanceType, testPlatform); err != nil {
 		logger.WithError(err).Error("Failed to trigger E2E workflow")
 		s.postE2EErrorComment(pr, fmt.Sprintf("Failed to trigger E2E workflow: %v", err))
-		// Remove instances from tracking map before cleanup to avoid double-destroy on later cleanup.
+		// Remove from tracking before cleanup to avoid double-destroy on later cleanup.
 		s.e2eInstancesLock.Lock()
 		delete(s.e2eInstances, key)
 		s.e2eInstancesLock.Unlock()
-		// Attempt cleanup on workflow trigger failure
 		s.destroyE2EInstances(instances, logger)
 		return
 	}
@@ -153,14 +172,14 @@ func (s *Server) handleE2ETestRequest(pr *model.PullRequest, label string) {
 	logger.Info("Successfully triggered E2E workflow")
 }
 
-// createMultipleE2EInstances creates multiple instances for E2E testing
+// createMultipleE2EInstances creates all platform instances in parallel.
+// Results are returned in the same order as platforms[] so that callers can rely on
+// index-based platform assignment (e.g. instances[0] = site-1 for mobile).
 func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType string, platforms []string) ([]*E2EInstance, error) {
 	if len(platforms) == 0 {
 		return nil, fmt.Errorf("no platforms specified")
 	}
 
-	var instances []*E2EInstance
-
 	logger := s.Logger.WithFields(logrus.Fields{
 		"repo": pr.RepoName,
 		"pr":   pr.Number,
@@ -168,58 +187,93 @@ func (s *Server) createMultipleE2EInstances(pr *model.PullRequest, instanceType
 		"platforms": len(platforms),
 	})
 
-	// Create username and password for this E2E test set
-	var username, password string
-	username = s.Config.E2EUsername
-
-	// Get password from environment or generate one
-	password = s.getE2EPassword(instanceType)
-
+	username := s.Config.E2EUsername
+	password := s.getE2EPassword(instanceType)
 	// Name format: {type}-pr-{pr}-{platform}-{hex6}
 	uid := e2eUniqueSuffix()
 
-	for _, platform := range platforms {
-		instanceName := e2eInstanceName(
-			s.Config.DNSNameTestServer,
-			instanceType, fmt.Sprintf("pr-%d", pr.Number), platform, uid,
-		)
+	// Shared cancellable context: the first goroutine to fail cancels the rest so they
+	// exit their polling loop within one sleep interval (30s) instead of waiting up to 30min.
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
 
-		logger.WithField("instance", instanceName).Info("Creating E2E instance")
+	type result struct {
+		instance *E2EInstance
+		err      error
+	}
+	// Pre-allocate by index so each goroutine writes to its own slot — no mutex needed.
+	results := make([]result, len(platforms))
+	var wg sync.WaitGroup
+
+	for i, platform := range platforms {
+		wg.Add(1)
+		go func(idx int, platform string) {
+			defer wg.Done()
+			instanceName := e2eInstanceName(
+				s.Config.DNSNameTestServer,
+				instanceType, fmt.Sprintf("pr-%d", pr.Number), platform, uid,
+			)
+			logger.WithField("instance", instanceName).Info("Creating E2E instance")
+			inst, err := s.createCloudInstallation(ctx, instanceName, s.Config.E2EServerVersion, username, password, instanceType, logger)
+			if err != nil {
+				cancel() // signal sibling goroutines to stop waiting
+				results[idx] = result{err: err}
+				return
+			}
+			inst.Platform = platform
+			if instanceType == "desktop" {
+				inst.Runner = getRunnerForPlatform(platform)
+			}
+			results[idx] = result{instance: inst}
+		}(i, platform)
+	}
 
-		// Create the installation
-		instance, err := s.createCloudInstallation(instanceName, s.Config.E2EServerVersion, username, password, instanceType, logger)
-		if err != nil {
-			logger.WithError(err).Error("Failed to create cloud installation")
-			// Cleanup already created instances on failure
-			s.destroyE2EInstances(instances, logger)
-			return nil, err
-		}
+	wg.Wait()
 
-		instance.Platform = platform
-		if instanceType == "desktop" {
-			// Assign appropriate runner for each platform
-			instance.Runner = getRunnerForPlatform(platform)
+	// Collect results in platforms[] order. On any error, destroy all that succeeded.
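The fan-out shape used here, in isolation: per-index result slots avoid a mutex, and the shared context lets the first failure cancel its siblings. A runnable sketch with a hypothetical `worker` standing in for `createCloudInstallation`:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"
)

// worker simulates one per-platform creation; "windows" fails to show cancellation.
func worker(ctx context.Context, platform string) (string, error) {
	select {
	case <-ctx.Done(): // a sibling failed; stop waiting
		return "", ctx.Err()
	case <-time.After(10 * time.Millisecond):
	}
	if platform == "windows" {
		return "", errors.New("provisioning failed")
	}
	return "instance-" + platform, nil
}

func main() {
	platforms := []string{"linux", "macos", "windows"}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	type result struct {
		val string
		err error
	}
	results := make([]result, len(platforms)) // one slot per goroutine: no mutex needed
	var wg sync.WaitGroup
	for i, p := range platforms {
		wg.Add(1)
		go func(idx int, platform string) {
			defer wg.Done()
			v, err := worker(ctx, platform)
			if err != nil {
				cancel() // first failure wakes the remaining workers
			}
			results[idx] = result{val: v, err: err}
		}(i, p)
	}
	wg.Wait()

	for i, r := range results {
		fmt.Println(platforms[i], r.val, r.err)
	}
}
```

Writing each goroutine's result into its own slice index keeps collection ordered by `platforms[]` without any locking, since no two goroutines ever touch the same element.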
+	var instances []*E2EInstance
+	var firstErr error
+	for _, r := range results {
+		if r.err != nil {
+			if firstErr == nil {
+				firstErr = r.err
+			}
+		} else {
+			instances = append(instances, r.instance)
 		}
+	}
 
-		instances = append(instances, instance)
-		logger.WithField("instance", instanceName).Info("Successfully created E2E instance")
+	if firstErr != nil {
+		s.destroyE2EInstances(instances, logger)
+		return nil, firstErr
 	}
 
 	return instances, nil
 }
 
-// createCloudInstallation creates a single installation via provisioner API
-func (s *Server) createCloudInstallation(name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) {
-	// Create installation request
-	envVars := cloudModel.EnvVarMap{
-		"MM_SERVICESETTINGS_ENABLETUTORIAL":       cloudModel.EnvVar{Value: "false"},
-		"MM_SERVICESETTINGS_ENABLEONBOARDINGFLOW": cloudModel.EnvVar{Value: "false"},
-		"MM_SERVICEENVIRONMENT":                   cloudModel.EnvVar{Value: "test"},
+// createCloudInstallation creates a single installation via provisioner API.
+// ctx is used to cancel the polling wait so that parallel callers can abort early when a
+// sibling goroutine fails, instead of waiting up to 30 minutes per polling interval.
+func (s *Server) createCloudInstallation(ctx context.Context, name, version, username, password, instanceType string, logger logrus.FieldLogger) (*E2EInstance, error) {
+	if err := ctx.Err(); err != nil {
+		return nil, fmt.Errorf("installation creation cancelled before request: %w", err)
 	}
 
-	// Enable automatic replies for mobile E2E tests
-	if instanceType == "mobile" {
-		envVars["MM_TEAMSETTINGS_EXPERIMENTALENABLEAUTOMATICREPLIES"] = cloudModel.EnvVar{Value: "true"}
+	// Create installation request
+	envVars := cloudModel.EnvVarMap{
+		"MM_SERVICESETTINGS_ENABLETUTORIAL":             cloudModel.EnvVar{Value: "false"},
+		"MM_SERVICESETTINGS_ENABLEONBOARDINGFLOW":       cloudModel.EnvVar{Value: "false"},
+		"MM_SERVICESETTINGS_ENABLEUSERTYPINGMESSAGES":   cloudModel.EnvVar{Value: "false"},
+		"MM_SERVICESETTINGS_SESSIONLENGTHMOBILEINHOURS": cloudModel.EnvVar{Value: "5000"},
+		"MM_SERVICESETTINGS_SESSIONCACHEINMINUTES":      cloudModel.EnvVar{Value: "180"},
+		"MM_SERVICEENVIRONMENT":                         cloudModel.EnvVar{Value: "test"},
+		"MM_RATELIMITSETTINGS_ENABLE":                   cloudModel.EnvVar{Value: "true"},
+		"MM_RATELIMITSETTINGS_PERSEC":                   cloudModel.EnvVar{Value: "3000"},
+		"MM_RATELIMITSETTINGS_MAXBURST":                 cloudModel.EnvVar{Value: "5000"},
+		"MM_RATELIMITSETTINGS_MEMORYSTORESIZE":          cloudModel.EnvVar{Value: "10000"},
+		"MM_RATELIMITSETTINGS_VARYBYREMOTEADDR":         cloudModel.EnvVar{Value: "false"},
+		"MM_RATELIMITSETTINGS_VARYBYUSER":               cloudModel.EnvVar{Value: "false"},
+		"MM_TEAMSETTINGS_EXPERIMENTALENABLEAUTOMATICREPLIES": cloudModel.EnvVar{Value: "true"},
 	}
 
 	installationRequest := &cloudModel.CreateInstallationRequest{
@@ -244,6 +298,17 @@ func (s *Server) createCloudInstallation(name, version, username, password, inst
 		return nil, fmt.Errorf("failed to create installation: %w", err)
 	}
 
+	// cleanupCreatedInstallation is a best-effort cleanup helper used on all failure paths after
+	// CreateInstallation succeeds. Without it, the cloud installation would be permanently
+	// orphaned because it has not yet been added to the in-memory tracking map.
+	// It deletes the installation, logs any deletion error, and returns cause unchanged.
+	cleanupCreatedInstallation := func(cause error) error {
+		if delErr := s.CloudClient.DeleteInstallation(installation.ID); delErr != nil {
+			logger.WithError(delErr).WithField("installation_id", installation.ID).Error("Failed to clean up partially created installation")
+		}
+		return cause
+	}
+
 	logger.WithField("installation_id", installation.ID).Info("Installation created, waiting for stable state")
 
 	// Wait for installation to be stable using polling with timeout
@@ -251,7 +316,7 @@ func (s *Server) createCloudInstallation(name, version, username, password, inst
 	for {
 		inst, err := s.CloudClient.GetInstallation(installation.ID, nil)
 		if err != nil {
-			return nil, fmt.Errorf("failed to get installation status: %w", err)
+			return nil, cleanupCreatedInstallation(fmt.Errorf("failed to get installation status: %w", err))
 		}
 
 		if inst.State == cloudModel.InstallationStateStable || inst.State == cloudModel.InstallationStateHibernating {
@@ -260,17 +325,27 @@ func (s *Server) createCloudInstallation(name, version, username, password, inst
 			break
 		}
 
 		if time.Now().After(timeout) {
-			return nil, fmt.Errorf("timeout waiting for installation to become stable")
+			return nil, cleanupCreatedInstallation(fmt.Errorf("timeout waiting for installation to become stable"))
 		}
 
-		time.Sleep(30 * time.Second)
+		// Context-aware sleep: wake immediately if a sibling goroutine failed and cancelled ctx.
+		select {
+		case <-ctx.Done():
+			return nil, cleanupCreatedInstallation(fmt.Errorf("installation wait cancelled: %w", ctx.Err()))
+		case <-time.After(30 * time.Second):
+		}
+	}
+
+	// Check cancellation before the 10-minute initialization phase (DNS + ping + user setup).
+	if err := ctx.Err(); err != nil {
+		return nil, cleanupCreatedInstallation(fmt.Errorf("installation creation cancelled before initialization: %w", err))
 	}
 
 	// Initialize Mattermost server with provided credentials
 	spinwickURL := fmt.Sprintf("https://%s", cloudtools.GetInstallationDNSFromDNSRecords(installation))
 	err = s.initializeMattermostE2EServer(spinwickURL, username, password, logger)
 	if err != nil {
-		return nil, fmt.Errorf("failed to initialize Mattermost server: %w", err)
+		return nil, cleanupCreatedInstallation(fmt.Errorf("failed to initialize Mattermost server: %w", err))
 	}
 
 	return &E2EInstance{
@@ -490,7 +565,7 @@ func (s *Server) triggerMobileE2EWorkflow(ctx context.Context, client *github.Cl
 	return nil
 }
 
-// handleE2ECleanup destroys all E2E instances for a PR
+// handleE2ECleanup destroys tracked E2E instances, then queries the cloud API by DNS pattern to catch orphans.
 func (s *Server) handleE2ECleanup(pr *model.PullRequest) {
 	logger := s.Logger.WithFields(logrus.Fields{
 		"repo": pr.RepoName,
@@ -499,20 +574,146 @@ func (s *Server) handleE2ECleanup(pr *model.PullRequest) {
 	})
 	logger.Info("Handling E2E cleanup request")
 
-	// Retrieve and remove instances from tracking
+	// Fast path: in-memory map
 	key := fmt.Sprintf("%s-pr-%d", pr.RepoName, pr.Number)
 	s.e2eInstancesLock.Lock()
 	instances := s.e2eInstances[key]
 	delete(s.e2eInstances, key)
 	s.e2eInstancesLock.Unlock()
 
-	if len(instances) == 0 {
-		logger.Warn("No E2E instances found for cleanup")
+	if len(instances) > 0 {
+		logger.WithField("instances", len(instances)).Info("Destroying tracked E2E instances")
+		s.destroyE2EInstances(instances, logger)
+	}
+
+	// Fallback: catch orphans from restarts, map overwrites, or failed goroutines
+	s.cleanupOrphanedE2EInstances(pr, logger)
+}
+
+// cleanupOrphanedE2EInstances queries the cloud API by DNS LIKE pattern and destroys any matches.
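The context-aware polling wait, reduced to a standalone sketch. `pollOnce` is a hypothetical stand-in for the `GetInstallation` state check, and the intervals are shortened so the example runs quickly:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitUntilStable polls until pollOnce reports stable, the deadline passes, or ctx is
// cancelled. The select makes the sleep interruptible, so cancellation is observed
// within one interval instead of after a full sleep.
func waitUntilStable(ctx context.Context, pollOnce func() (bool, error), interval, max time.Duration) error {
	deadline := time.Now().Add(max)
	for {
		stable, err := pollOnce()
		if err != nil {
			return err
		}
		if stable {
			return nil
		}
		if time.Now().After(deadline) {
			return errors.New("timeout waiting for stable state")
		}
		select {
		case <-ctx.Done(): // a sibling failed and cancelled: stop waiting immediately
			return fmt.Errorf("wait cancelled: %w", ctx.Err())
		case <-time.After(interval):
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(30 * time.Millisecond) // simulate a sibling goroutine failing
		cancel()
	}()
	err := waitUntilStable(ctx, func() (bool, error) { return false, nil },
		10*time.Millisecond, time.Second)
	fmt.Println(err) // wait cancelled: context canceled
}
```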
+func (s *Server) cleanupOrphanedE2EInstances(pr *model.PullRequest, logger logrus.FieldLogger) {
+	var instanceType string
+	if strings.Contains(pr.RepoName, "desktop") {
+		instanceType = "desktop"
+	} else if strings.Contains(pr.RepoName, "mobile") {
+		instanceType = "mobile"
+	} else {
+		logger.Debug("Skipping orphan E2E cleanup for non-E2E repo")
+		return
+	}
+
+	dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number) // e.g. "mobile-pr-9587-%"
+
+	installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{
+		DNS:    dnsPattern,
+		Paging: cloudModel.AllPagesNotDeleted(),
+	})
+	if err != nil {
+		logger.WithError(err).Error("Failed to query cloud API for orphaned E2E instances")
+		return
+	}
+
+	if len(installations) == 0 {
+		logger.Debug("No orphaned E2E instances found via cloud API")
 		return
 	}
 
-	logger.WithField("instances", len(instances)).Info("Destroying E2E instances")
-	s.destroyE2EInstances(instances, logger)
+	logger.WithField("orphans", len(installations)).Warn("Found orphaned E2E instances via cloud API")
+	for _, inst := range installations {
+		// Skip instances already progressing through deletion to avoid redundant API calls.
+		if inst.State == cloudModel.InstallationStateDeletionPendingRequested ||
+			inst.State == cloudModel.InstallationStateDeletionPendingInProgress ||
+			inst.State == cloudModel.InstallationStateDeletionPending ||
+			inst.State == cloudModel.InstallationStateDeletionRequested ||
+			inst.State == cloudModel.InstallationStateDeletionFailed ||
+			inst.State == cloudModel.InstallationStateDeleted {
+			logger.WithField("installation_id", inst.ID).Debug("Skipping E2E instance already in deletion state")
+			continue
+		}
+		instLogger := logger.WithField("installation_id", inst.ID)
+		instLogger.Info("Destroying orphaned E2E instance")
+		if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil {
+			instLogger.WithError(err).Error("Failed to destroy orphaned E2E instance")
+		}
+	}
+}
+
+// cleanupNonPRE2EInstancesOnStartup destroys stale non-PR E2E instances left over from a
+// previous matterwick run. When matterwick restarts, the in-memory tracking map is wiped,
+// making push/nightly/CMT instances untrackable — the workflow_run(completed) cleanup path
+// will find nothing. An 8-hour age threshold is used so that instances still being actively
+// used by tests that started before the restart are NOT destroyed mid-run.
+//
+// Reasoning: E2E tests (nightly, push, CMT) take at most a few hours. Any non-PR instance
+// older than 8 hours whose tracking was lost must be orphaned.
+//
+// PR instances (identified by "-pr-" in their OwnerID) are always skipped — handleE2ECleanup
+// on PR close, which includes DNS orphan cleanup, manages their lifecycle.
+func (s *Server) cleanupNonPRE2EInstancesOnStartup() {
+	const maxAge = 8 * time.Hour
+	logger := s.Logger.WithField("type", "startup_e2e_cleanup")
+	logger.WithField("max_age_hours", 8).Info("Scanning for stale non-PR E2E instances from previous matterwick run")
+
+	cutoffMs := time.Now().Add(-maxAge).UnixMilli()
+
+	for _, instanceType := range []string{"desktop", "mobile"} {
+		pattern := instanceType + "-%"
+		installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{
+			DNS:    pattern,
+			Paging: cloudModel.AllPagesNotDeleted(),
+		})
+		if err != nil {
+			logger.WithError(err).Errorf("Failed to query %s instances on startup", instanceType)
+			continue
+		}
+
+		for _, inst := range installations {
+			// Guard against a nil embedded Installation — must come first, all other
+			// field accesses (OwnerID, CreateAt, State, ID) are on the embedded struct.
+			if inst.Installation == nil {
+				logger.Warn("Skipping instance with nil Installation pointer in startup cleanup")
+				continue
+			}
+
+			// PR instances have "-pr-" in their OwnerID (e.g. "mobile-pr-123-site-1-...").
+			// Skip them — handleE2ECleanup on PR close manages their lifecycle.
+			if strings.Contains(inst.OwnerID, "-pr-") {
+				continue
+			}
+
+			// Skip instances created within the last 8 hours — tests may still be running.
+			// If matterwick restarted mid-test, destroying active test instances would break them.
+			if inst.CreateAt > cutoffMs {
+				logger.WithFields(logrus.Fields{
+					"installation_id": inst.ID,
+					"owner_id":        inst.OwnerID,
+				}).Debug("Skipping non-PR instance younger than 8h (may still be in use)")
+				continue
+			}
+
+			// Skip instances already progressing through deletion.
+			if inst.State == cloudModel.InstallationStateDeletionPendingRequested ||
+				inst.State == cloudModel.InstallationStateDeletionPendingInProgress ||
+				inst.State == cloudModel.InstallationStateDeletionPending ||
+				inst.State == cloudModel.InstallationStateDeletionRequested ||
+				inst.State == cloudModel.InstallationStateDeletionFailed ||
+				inst.State == cloudModel.InstallationStateDeleted {
+				continue
+			}
+
+			instLogger := logger.WithFields(logrus.Fields{
+				"installation_id": inst.ID,
+				"owner_id":        inst.OwnerID,
+				"state":           inst.State,
+			})
+			instLogger.Warn("Destroying stale non-PR E2E instance (older than 8h) left from previous matterwick run")
+			if err := s.CloudClient.DeleteInstallation(inst.ID); err != nil {
+				instLogger.WithError(err).Error("Failed to destroy stale non-PR E2E instance")
+			}
+		}
+	}
+
+	logger.Info("Startup non-PR E2E instance cleanup complete")
+}
 
 // destroyE2EInstances destroys all given E2E instances
@@ -531,6 +732,119 @@ func (s *Server) destroyE2EInstances(instances []*E2EInstance, logger logrus.Fie
 	}
 }
 
+// findExistingE2EInstancesInCloud queries the cloud API for E2E instances that match a PR and
+// reconstructs E2EInstance objects for reuse. Returns an error if the count doesn't match
+// the expected number of platforms (indicating a partial or fully absent set).
+func (s *Server) findExistingE2EInstancesInCloud(pr *model.PullRequest, instanceType string, platforms []string) ([]*E2EInstance, error) {
+	dnsPattern := fmt.Sprintf("%s-pr-%d-%%", instanceType, pr.Number)
+
+	installations, err := s.CloudClient.GetInstallations(&cloudModel.GetInstallationsRequest{
+		DNS:    dnsPattern,
+		Paging: cloudModel.AllPagesNotDeleted(),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to query cloud API for existing E2E instances: %w", err)
+	}
+
+	// Only reuse instances that are in a stable, usable state.
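The age-cutoff comparison above assumes `CreateAt` is epoch milliseconds. A small self-contained check of that logic, with illustrative values:

```go
package main

import (
	"fmt"
	"time"
)

// isStale reports whether an installation created at createAtMs (epoch milliseconds,
// as assumed for the cloud API's CreateAt field) is older than maxAge.
func isStale(createAtMs int64, maxAge time.Duration) bool {
	cutoffMs := time.Now().Add(-maxAge).UnixMilli()
	return createAtMs <= cutoffMs
}

func main() {
	nineHoursAgo := time.Now().Add(-9 * time.Hour).UnixMilli()
	oneHourAgo := time.Now().Add(-1 * time.Hour).UnixMilli()
	fmt.Println(isStale(nineHoursAgo, 8*time.Hour)) // true: safe to destroy
	fmt.Println(isStale(oneHourAgo, 8*time.Hour))   // false: may still be in use
}
```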
+	var reusable []*cloudModel.InstallationDTO
+	for _, inst := range installations {
+		if inst.State == cloudModel.InstallationStateStable ||
+			inst.State == cloudModel.InstallationStateHibernating {
+			reusable = append(reusable, inst)
+		}
+	}
+
+	// Parse the platform token from each OwnerID.
+	// OwnerID format: {type}-{version}-{platform}-{uid}, where uid is a trailing hex token
+	// containing no hyphens. Strip everything from the last hyphen onward, then check which
+	// expected platform the remainder ends with; this avoids hard-coding the uid length.
+	platformByInst := make(map[string]*cloudModel.InstallationDTO, len(reusable)) // platform → inst
+	for _, inst := range reusable {
+		ownerID := inst.OwnerID
+		withoutUID := ownerID
+		if idx := strings.LastIndex(ownerID, "-"); idx > 0 {
+			withoutUID = ownerID[:idx]
+		}
+		var matched string
+		for _, p := range platforms {
+			if strings.HasSuffix(withoutUID, "-"+p) {
+				matched = p
+				break
+			}
+		}
+		if matched == "" {
+			return nil, fmt.Errorf("could not determine platform for instance %q", ownerID)
+		}
+		if _, dup := platformByInst[matched]; dup {
+			return nil, fmt.Errorf("duplicate platform %q found among cloud instances", matched)
+		}
+		platformByInst[matched] = inst
+	}
+
+	// Validate that every expected platform is present.
+	for _, p := range platforms {
+		if _, ok := platformByInst[p]; !ok {
+			return nil, fmt.Errorf("expected platform %q not found among cloud instances (found %d, want %d)", p, len(reusable), len(platforms))
+		}
+	}
+
+	// Build result in platforms[] order so index-based assignment is stable for callers.
+	result := make([]*E2EInstance, len(platforms))
+	for i, platform := range platforms {
+		inst := platformByInst[platform]
+		e2eInst := &E2EInstance{
+			Name:           inst.OwnerID,
+			Platform:       platform,
+			URL:            fmt.Sprintf("https://%s.%s", inst.OwnerID, s.Config.DNSNameTestServer),
+			InstallationID: inst.ID,
+			ServerVersion:  inst.Version,
+		}
+		if instanceType == "desktop" {
+			e2eInst.Runner = getRunnerForPlatform(platform)
+		}
+		result[i] = e2eInst
+	}
+	return result, nil
+}
+
+// wakeUpHibernatingInstances checks each instance and wakes any that are hibernating,
+// waiting up to 10 minutes for stable state. Logs warnings on failure and proceeds.
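A standalone check of the OwnerID parsing used here: strip the trailing `-{uid}` token at the last hyphen, then match the expected platform as a suffix of what remains. The sample IDs are illustrative only:

```go
package main

import (
	"fmt"
	"strings"
)

// platformOf strips the trailing "-{uid}" token from an OwnerID and returns which of
// the expected platforms the remainder ends with. Cutting at the last hyphen rather
// than a fixed offset keeps it correct for any uid length.
func platformOf(ownerID string, platforms []string) (string, bool) {
	withoutUID := ownerID
	if idx := strings.LastIndex(ownerID, "-"); idx > 0 {
		withoutUID = ownerID[:idx]
	}
	for _, p := range platforms {
		if strings.HasSuffix(withoutUID, "-"+p) {
			return p, true
		}
	}
	return "", false
}

func main() {
	mobile := []string{"site-1", "site-2", "site-3"}
	fmt.Println(platformOf("mobile-pr-123-site-2-a1b2c3", mobile)) // site-2 true
	desktop := []string{"linux", "macos", "windows"}
	fmt.Println(platformOf("desktop-pr-9-linux-0aff11", desktop)) // linux true
}
```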
+func (s *Server) wakeUpHibernatingInstances(instances []*E2EInstance, logger logrus.FieldLogger) {
+	for _, inst := range instances {
+		installation, err := s.CloudClient.GetInstallation(inst.InstallationID, nil)
+		if err != nil {
+			logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Failed to check installation state before wake-up")
+			continue
+		}
+		if installation.State != cloudModel.InstallationStateHibernating {
+			continue
+		}
+		logger.WithField("installation_id", inst.InstallationID).Info("Waking up hibernating E2E instance")
+		if _, err := s.CloudClient.WakeupInstallation(inst.InstallationID, nil); err != nil {
+			logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Failed to wake up hibernating E2E instance")
+			continue
+		}
+		timeout := time.Now().Add(10 * time.Minute)
+		for {
+			updated, err := s.CloudClient.GetInstallation(inst.InstallationID, nil)
+			if err != nil {
+				logger.WithError(err).WithField("installation_id", inst.InstallationID).Warn("Error polling installation state during wake-up")
+				break
+			}
+			if updated.State == cloudModel.InstallationStateStable {
+				logger.WithField("installation_id", inst.InstallationID).Info("Hibernating E2E instance is now stable")
+				break
+			}
+			if time.Now().After(timeout) {
+				logger.WithField("installation_id", inst.InstallationID).Warn("Timeout waiting for E2E instance to wake up")
+				break
+			}
+			time.Sleep(15 * time.Second)
+		}
+	}
+}
+
 // postE2EStartedComment posts a comment when E2E tests start
 func (s *Server) postE2EStartedComment(pr *model.PullRequest, instances []*E2EInstance) {
 	ctx := context.Background()
diff --git a/server/pull_request.go b/server/pull_request.go
index c3255b3..68d5bfb 100644
--- a/server/pull_request.go
+++ b/server/pull_request.go
@@ -65,8 +65,8 @@ func (s *Server) handlePullRequestEvent(event *github.PullRequestEvent) {
 		return
 	}
 	if s.isE2ELabel(label) {
-		logger.WithField("label", label).Info("PR E2E test label was removed")
-		go s.handleE2ECleanup(pr)
+		logger.WithField("label", label).Info("PR E2E test label was removed, canceling workflow runs but keeping servers alive")
+		go s.cancelPRWorkflowRuns(pr, logger)
 		return
 	}
 	if s.isSpinWickLabel(label) {
diff --git a/server/push_events.go b/server/push_events.go
index 1c58484..a12f45c 100644
--- a/server/push_events.go
+++ b/server/push_events.go
@@ -4,8 +4,10 @@ package server
 
 import (
+	"context"
 	"fmt"
 	"strings"
+	"sync"
 
 	"github.com/google/go-github/v32/github"
 	"github.com/sirupsen/logrus"
@@ -135,12 +137,10 @@ func (s *Server) handlePushEventE2E(event *github.PushEvent, branch string, vers
 	logger.Info("E2E workflow triggered successfully and instances tracked for cleanup")
 }
 
-// createMultipleE2EInstancesForPushEvent creates instances for push event E2E testing
+// createMultipleE2EInstancesForPushEvent creates all platform instances in parallel.
+// Results are returned in platforms[] order so index-based assignment is stable.
 func (s *Server) createMultipleE2EInstancesForPushEvent(repoName, instanceType, branch, version, _ string) ([]*E2EInstance, error) {
-	var instances []*E2EInstance
 	var platforms []string
-
-	// For push events, always use all platforms
 	if instanceType == "desktop" {
 		platforms = []string{"linux", "macos", "windows"}
 	} else {
@@ -164,25 +164,55 @@ func (s *Server) createMultipleE2EInstancesForPushEvent(repoName, instanceType,
 	username := s.Config.E2EUsername
 	password := s.getE2EPassword(instanceType)
 
-	for _, platform := range platforms {
-		name := e2eInstanceName(
-			s.Config.DNSNameTestServer,
-			instanceType, sanitizedVersion, platform, uid,
-		)
-
-		instance, err := s.createCloudInstallation(name, serverVersion, username, password, instanceType, logger)
-		if err != nil {
-			logger.WithError(err).Errorf("Failed to create instance for platform %s", platform)
-			// Cleanup already created instances on failure
-			s.destroyE2EInstances(instances, logger)
-			return nil, err
-		}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	type result struct {
+		instance *E2EInstance
+		err      error
+	}
+	results := make([]result, len(platforms))
+	var wg sync.WaitGroup
+
+	for i, platform := range platforms {
+		wg.Add(1)
+		go func(idx int, platform string) {
+			defer wg.Done()
+			name := e2eInstanceName(
+				s.Config.DNSNameTestServer,
+				instanceType, sanitizedVersion, platform, uid,
+			)
+			inst, err := s.createCloudInstallation(ctx, name, serverVersion, username, password, instanceType, logger)
+			if err != nil {
+				cancel()
+				results[idx] = result{err: err}
+				return
+			}
+			inst.Platform = platform
+			if instanceType == "desktop" {
+				inst.Runner = getRunnerForPlatform(platform)
+			}
+			results[idx] = result{instance: inst}
+		}(i, platform)
+	}
 
-		instance.Platform = platform
-		if instanceType == "desktop" {
-			instance.Runner = getRunnerForPlatform(platform)
+	wg.Wait()
+
+	var instances []*E2EInstance
+	var firstErr error
+	for _, r := range results {
+		if r.err != nil {
+			if firstErr == nil {
+				firstErr = r.err
+			}
+		} else {
+			instances = append(instances, r.instance)
 		}
-		instances = append(instances, instance)
+	}
+
+	if firstErr != nil {
+		s.destroyE2EInstances(instances, logger)
+		return nil, firstErr
 	}
 
 	logger.WithField("instanceCount", len(instances)).Info("All E2E instances created successfully")
diff --git a/server/server.go b/server/server.go
index 5b0fe78..1322c35 100644
--- a/server/server.go
+++ b/server/server.go
@@ -98,6 +98,11 @@ func New(config *MatterwickConfig) *Server {
 func (s *Server) Start() {
 	s.Logger.Info("Starting MatterWick Server")
 
+	// Destroy any non-PR E2E instances left over from a previous run.
+	// Must run before the HTTP listener starts so that cleanup completes before
+	// new webhook events can arrive and re-create conflicting instances.
+	s.cleanupNonPRE2EInstancesOnStartup()
+
 	s.initializeRouter()
 
 	var handler http.Handler = s.Router
diff --git a/server/workflow_run.go b/server/workflow_run.go
index abc626a..db3e5c8 100644
--- a/server/workflow_run.go
+++ b/server/workflow_run.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"io"
 	"strings"
+	"sync"
 
 	"github.com/sirupsen/logrus"
 )
@@ -27,6 +28,7 @@ type WorkflowRunWithInputs struct {
 	Name       string            `json:"name"`
 	HeadBranch string            `json:"head_branch"`
 	HeadSHA    string            `json:"head_sha"`
+	Event      string            `json:"event"` // triggering event: "push", "schedule", "workflow_dispatch", etc.
 	Inputs     map[string]string `json:"inputs"`
 }
 
@@ -122,7 +124,7 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay
 	if s.Config.E2ENightlyTriggerWorkflowName != "" && workflowName == s.Config.E2ENightlyTriggerWorkflowName {
 		if payload.Action == "requested" {
 			logger.Info("Nightly trigger workflow started, provisioning E2E servers")
-			go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, logger)
+			go s.handleNightlyE2ETrigger(owner, repoName, headBranch, headSHA, payload.WorkflowRun.Event, runID, logger)
 		}
 		return
 	}
@@ -137,12 +139,15 @@ func (s *Server) handleWorkflowRunEventWithInputs(payload *WorkflowRunWebhookPay
 	logger.Debug("Ignoring workflow_run event (not relevant to E2E lifecycle)")
 }
 
-// handleNightlyE2ETrigger provisions instances and dispatches the test workflow
-// for scheduled/nightly E2E runs. Called when the nightly trigger workflow starts.
-func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, logger logrus.FieldLogger) {
+// handleNightlyE2ETrigger provisions instances and dispatches the test workflow.
+// Called when the E2E trigger workflow starts, whether from schedule, push to master/main,
+// or push to a release branch. The triggerEvent parameter ("schedule", "push", etc.) is
+// used to set runType correctly — scheduled runs always get "NIGHTLY" regardless of branch.
+func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha, triggerEvent string, runID int64, logger logrus.FieldLogger) {
 	logger = logger.WithFields(logrus.Fields{
 		"branch": branch,
 		"sha":    sha,
+		"run_id": runID,
 	})
 	logger.Info("Provisioning nightly E2E instances")
 
@@ -160,14 +165,31 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo
 		return
 	}
 
-	// Track by sha so the test workflow completion can clean up
-	key := fmt.Sprintf("%s-scheduled-%s", repoName, sha)
+	// Include runID so two trigger runs against the same SHA (e.g. manual re-trigger)
+	// get separate tracking keys. The key still ends with "-{sha}" so
+	// findAndDestroyInstancesBySHA continues to match it by suffix.
+	key := fmt.Sprintf("%s-scheduled-%d-%s", repoName, runID, sha)
 	s.e2eInstancesLock.Lock()
 	s.e2eInstances[key] = instances
 	s.e2eInstancesLock.Unlock()
 
 	logger.WithField("tracking_key", key).Info("Nightly instances tracked, dispatching test workflow")
 
+	// Determine run classification. Scheduled runs are always NIGHTLY regardless of branch
+	// (a scheduled run on master must not be classified as MASTER). Push-triggered runs
+	// derive their type from the branch name.
+	runType := "NIGHTLY"
+	nightly := true
+	if triggerEvent != "schedule" {
+		if branch == "master" || branch == "main" {
+			runType = "MASTER"
+			nightly = false
+		} else if s.isReleaseBranch(branch) {
+			runType = "RELEASE"
+			nightly = false
+		}
+	}
+
 	var dispatchErr error
 	if instanceType == "desktop" {
 		instanceDetailsJSON, err := s.buildInstanceDetailsJSON(instances)
@@ -180,8 +202,7 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo
 			return
 		}
 		// Dispatch to the exact SHA so workflow_run completed event matches the tracking key.
-		// Pass runType="NIGHTLY" and nightly=true so the workflow correctly classifies this run.
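The run-classification rules above, reduced to a standalone function. `isReleaseBranch` here is a simplified stand-in for the server's own helper, assumed to match `release-*` branches:

```go
package main

import (
	"fmt"
	"strings"
)

// isReleaseBranch is a simplified stand-in for the server's helper; the real matching
// rule may differ.
func isReleaseBranch(branch string) bool {
	return strings.HasPrefix(branch, "release-")
}

// classify mirrors the rules above: scheduled runs are always NIGHTLY; pushes derive
// their type from the branch name.
func classify(triggerEvent, branch string) (runType string, nightly bool) {
	if triggerEvent == "schedule" {
		return "NIGHTLY", true
	}
	switch {
	case branch == "master" || branch == "main":
		return "MASTER", false
	case isReleaseBranch(branch):
		return "RELEASE", false
	default:
		return "NIGHTLY", true
	}
}

func main() {
	fmt.Println(classify("schedule", "master"))   // NIGHTLY true
	fmt.Println(classify("push", "main"))         // MASTER false
	fmt.Println(classify("push", "release-2.25")) // RELEASE false
}
```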
-		dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, "NIGHTLY", true)
+		dispatchErr = s.dispatchDesktopE2EWorkflow(owner, repoName, sha, sha, instanceDetailsJSON, runType, nightly)
 	} else {
 		if len(instances) < 3 {
 			logger.Errorf("Expected 3 mobile instances, got %d", len(instances))
@@ -192,9 +213,8 @@ func (s *Server) handleNightlyE2ETrigger(owner, repoName, branch, sha string, lo
 			return
 		}
 		// Dispatch to the exact SHA so workflow_run completed event matches the tracking key.
-		// Pass runType="NIGHTLY" so the workflow correctly classifies this as a nightly run.
 		dispatchErr = s.dispatchMobileE2EWorkflow(owner, repoName, sha, sha,
-			instances[0].URL, instances[1].URL, instances[2].URL, "both", "NIGHTLY")
+			instances[0].URL, instances[1].URL, instances[2].URL, "both", runType)
 	}
 
 	if dispatchErr != nil {
@@ -361,7 +381,7 @@ func (s *Server) createSingleCMTInstance(repoName, instanceType, version string,
 	username := s.Config.E2EUsername
 	password := s.getE2EPassword(instanceType)
 
-	return s.createCloudInstallation(name, version, username, password, instanceType, logger)
+	return s.createCloudInstallation(context.Background(), name, version, username, password, instanceType, logger)
 }
 
 // cmtServer is the server entry in CMT_MATRIX JSON.
@@ -495,13 +515,12 @@ func (s *Server) dispatchCMTWorkflow(repoOwner, repoName, sha, branch, cmtMatrix
 	return nil
 }
 
-// createCMTInstancesForVersion creates 3 instances (one per platform) for a given server
-// version. Used by nightly runs which dispatch the platform-aware e2e-functional.yml /
-// e2e-detox-pr.yml workflows (not the CMT matrix workflow).
+// createCMTInstancesForVersion creates 3 instances (one per platform) in parallel for a
+// given server version. Used by nightly runs which dispatch the platform-aware
+// e2e-functional.yml / e2e-detox-pr.yml workflows (not the CMT matrix workflow).
+// Results are returned in platforms[] order so index-based assignment is stable.
 func (s *Server) createCMTInstancesForVersion(repoName, instanceType, version, purpose string) ([]*E2EInstance, error) {
-	var instances []*E2EInstance
 	var platforms []string
-
 	if instanceType == "desktop" {
 		platforms = []string{"linux", "macos", "windows"}
 	} else {
@@ -521,25 +540,56 @@ func (s *Server) createCMTInstancesForVersion(repoName, instanceType, version, p
 	username := s.Config.E2EUsername
 	password := s.getE2EPassword(instanceType)
 
-	for _, platform := range platforms {
-		name := e2eInstanceName(
-			s.Config.DNSNameTestServer,
-			instanceType, sanitizedVersion, platform, uid,
-		)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	type result struct {
+		instance *E2EInstance
+		err      error
+	}
+	results := make([]result, len(platforms))
+	var wg sync.WaitGroup
+
+	for i, platform := range platforms {
+		wg.Add(1)
+		go func(idx int, platform string) {
+			defer wg.Done()
+			name := e2eInstanceName(
+				s.Config.DNSNameTestServer,
+				instanceType, sanitizedVersion, platform, uid,
+			)
+			inst, err := s.createCloudInstallation(ctx, name, version, username, password, instanceType, logger)
+			if err != nil {
+				cancel()
+				results[idx] = result{err: err}
+				return
+			}
+			inst.Platform = platform
+			if instanceType == "desktop" {
+				inst.Runner = getRunnerForPlatform(platform)
+			}
+			results[idx] = result{instance: inst}
+		}(i, platform)
+	}
+
+	wg.Wait()
 
-		instance, err := s.createCloudInstallation(name, version, username, password, instanceType, logger)
-		if err != nil {
-			logger.WithError(err).Errorf("Failed to create instance for platform %s", platform)
-			// Cleanup already created instances on failure
-			s.destroyE2EInstances(instances, logger)
-			return nil, err
+	var instances []*E2EInstance
+	var firstErr error
+	for _, r := range results {
+		if r.err != nil {
+			if firstErr == nil {
+				firstErr = r.err
+			}
+		} else {
+			instances = append(instances, r.instance)
 		}
+	}
 
-		instance.Platform = platform
-		if instanceType == "desktop" {
-			instance.Runner = getRunnerForPlatform(platform)
-		}
-		instances = append(instances, instance)
+	if firstErr != nil {
+		logger.WithError(firstErr).Error("Failed to create one or more instances; destroying all")
+		s.destroyE2EInstances(instances, logger)
+		return nil, firstErr
 	}
 
 	logger.WithField("instanceCount", len(instances)).Info("Instances created for version")
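The tracking-key suffix contract mentioned above, shown in isolation: both the old `{repo}-scheduled-{sha}` form and the new `{repo}-scheduled-{runID}-{sha}` form end with `-{sha}`, so a suffix lookup (as `findAndDestroyInstancesBySHA` is described to do) matches either. The helper below is a hypothetical sketch, not the server's implementation:

```go
package main

import (
	"fmt"
	"strings"
)

// findKeysBySHA is a hypothetical sketch of suffix-based key matching: any tracking
// key ending in "-{sha}" is selected, whichever key format produced it.
func findKeysBySHA(keys []string, sha string) []string {
	var matched []string
	for _, k := range keys {
		if strings.HasSuffix(k, "-"+sha) {
			matched = append(matched, k)
		}
	}
	return matched
}

func main() {
	sha := "abc123"
	keys := []string{
		"mattermost-mobile-scheduled-abc123",         // old key format
		"mattermost-mobile-scheduled-8675309-abc123", // new format with runID
		"mattermost-mobile-pr-42",                    // unrelated key
	}
	fmt.Println(findKeysBySHA(keys, sha)) // both scheduled keys match
}
```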