Use-Tusk · sohil-kshirsagar · Mar 22, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/cmd/run.go b/cmd/run.go
@@ -391,6 +391,9 @@ func runTests(cmd *cobra.Command, args []string) error {
 	var tests []runner.Test
 	var err error
 
+	// Track overall timing for print mode (includes test loading)
+	overallStart := time.Now()
+
 	// Step 3: Load tests - in cloud mode, fetch from backend; otherwise use local files
 	deferLoadTests := interactive
 	if deferLoadTests {
@@ -457,7 +460,7 @@ func runTests(cmd *cobra.Command, args []string) error {
 		if isValidation {
 			log.Stderrln(fmt.Sprintf("\n➤ Found %d traces to validate", len(tests)))
 		} else if !cloud {
-			log.Stderrln(fmt.Sprintf("\n➤ Loaded %d tests from local traces", len(tests)))
+			log.Stderrln(fmt.Sprintf("\n➤ Loaded %d tests from local traces (%.1fs)", len(tests), time.Since(overallStart).Seconds()))
 		}
 	}
 
@@ -732,13 +735,25 @@ func runTests(cmd *cobra.Command, args []string) error {
 	}
 
 	// Step 4: Run tests by environment
+	testPhaseStart := time.Now()
 	var results []runner.TestResult
 	if groupResult != nil && len(groupResult.Groups) > 0 {
 		// Use environment-based replay
 		results, err = runner.ReplayTestsByEnvironment(context.Background(), executor, groupResult.Groups)
 		if err != nil {
 			cmd.SilenceUsage = true
 
+			// Dump startup logs so user can diagnose startup failures
+			startupLogs := executor.GetStartupLogs()
+			if startupLogs != "" {
+				log.Stderrln("\n📋 Service startup logs:")
+				for _, line := range strings.Split(strings.TrimRight(startupLogs, "\n"), "\n") {
+					log.Stderrln(line)
+				}
+				log.Stderrln("")
+			}
+			log.Stderr(executor.GetStartupFailureHelpMessage())
+
 			// Update CI status to FAILURE if in cloud mode
 			if cloud && client != nil && (ci || isValidation) {
 				statusReq := &backend.UpdateDriftRunCIStatusRequest{
@@ -780,6 +795,14 @@ func runTests(cmd *cobra.Command, args []string) error {
 				}
 			}
 
+			startupLogs := executor.GetStartupLogs()
+			if startupLogs != "" {
+				log.Stderrln("\n📋 Service startup logs:")
+				for _, line := range strings.Split(strings.TrimRight(startupLogs, "\n"), "\n") {
+					log.Stderrln(line)
+				}
+				log.Stderrln("")
+			}
 			log.Stderr(executor.GetStartupFailureHelpMessage())
 			return fmt.Errorf("failed to start environment: %w", err)
 		}
@@ -790,7 +813,7 @@ func runTests(cmd *cobra.Command, args []string) error {
 		}()
 
 		if !interactive && !quiet {
-			log.Stderrln("  ✓ Environment ready")
+			log.Stderrln(fmt.Sprintf("  ✓ Environment ready (%.1fs)", time.Since(testPhaseStart).Seconds()))
 			log.Stderrln(fmt.Sprintf("➤ Running %d tests (concurrency: %d)...\n", len(tests), executor.GetConcurrency()))
 		}
 
@@ -830,6 +853,10 @@ func runTests(cmd *cobra.Command, args []string) error {
 		outputErr = runner.OutputResultsSummary(results, outputFormat, quiet)
 	}
 
+	if !interactive && !quiet {
+		log.Stderrln(fmt.Sprintf("Total elapsed: %.1fs", time.Since(overallStart).Seconds()))
+	}
+
 	// Step 5: Upload results to backend if in cloud mode
 	// Do this before returning any error so CI status is always updated
 	if cloud && client != nil && (ci || isValidation) {

diff --git a/docs/tui-testing.md b/docs/tui-testing.md
@@ -0,0 +1,160 @@
+# CLI & TUI Testing Guide
+
+How to manually test the tusk CLI in both print mode and interactive TUI mode.
+
+## Print Mode Testing
+
+Print mode (`--print`) runs headlessly — no interactive UI. Run it directly and inspect stderr:
+
+```bash
+cd /path/to/test-project
+/path/to/tusk drift run --print 2>&1
+```
+
+Filter for specific output:
+
+```bash
+/path/to/tusk drift run --print 2>&1 | grep -E "(➤|✓|Tests:|Error:)"
+```
+
+### Testing failure scenarios
+
+To test startup failures, temporarily change the start command in `.tusk/config.yaml`:
+
+```yaml
+start:
+    command: node -e "console.log('boot log line'); console.error('some error'); process.exit(1)"
+```
+
+To test with a service that starts but behaves differently, adjust the command or example codebase as needed.
+
+**Always restore the config after testing.**
+
+## TUI Testing with tmux
+
+The TUI (interactive mode, no `--print`) requires a terminal. We use tmux for programmatic control — it lets us send keystrokes and capture output without needing to be in the terminal ourselves.
+
+### Option A: Native screenshots (recommended)
+
+Opens a real Terminal.app window with tmux inside it, then uses macOS `screencapture -l` to capture that specific window by ID. This produces pixel-perfect Retina screenshots and should always be used to verify TUI visual changes.
+
+**One-time setup:** Grant Screen Recording permission to Terminal.app in System Settings > Privacy & Security > Screen Recording.
+
+```bash
+# 1. Open Terminal.app with a tmux session
+osascript -e 'tell application "Terminal"
+    do script "tmux new-session -s tui-test -x 200 -y 55"
+end tell'
+sleep 3
+
+# 2. Resize the window to fill most of the screen (fits 14"/16" MacBook Pro)
+osascript -e 'tell application "Terminal" to set bounds of front window to {0, 25, 1700, 1100}'
+sleep 1
+
+# 3. Hide tmux status bar (otherwise a green bar appears at the bottom)
+tmux set -t tui-test status off
+
+# 4. Launch the TUI
+tmux send-keys -t tui-test 'cd /path/to/test-project && /path/to/tusk drift run' Enter
+
+# 5. Wait for the state you want to capture
+#    - Normal run with tests: ~25-30s (environment start + test execution)
+#    - Startup failure with sandbox retry: ~15-18s
+#    - Just initial render: ~3-5s
+sleep 25
+
+# 6. Navigate if needed
+tmux send-keys -t tui-test g        # go to top (select Service Logs)
+tmux send-keys -t tui-test j        # move selection down
+tmux send-keys -t tui-test J        # scroll log panel down
+tmux send-keys -t tui-test D        # half-page down in log panel
+sleep 1
+
+# 7. Find the Terminal.app window ID (needs the Quartz Python bindings: pip3 install pyobjc-framework-Quartz)
+WINDOW_ID=$(python3 -c "
+import Quartz
+windows = Quartz.CGWindowListCopyWindowInfo(Quartz.kCGWindowListOptionOnScreenOnly, Quartz.kCGNullWindowID)
+for w in windows:
+    if w.get('kCGWindowOwnerName') == 'Terminal' and w.get('kCGWindowLayer', 0) == 0:
+        print(w['kCGWindowNumber'])
+        break
+")
+
+# 8. Capture the window
+screencapture -l "$WINDOW_ID" -o screenshot.png
+
+# 9. Cleanup
+tmux send-keys -t tui-test q
+sleep 2
+tmux kill-session -t tui-test
+osascript -e 'tell application "Terminal" to close front window' 2>/dev/null
+```
+
+**Output:** ~2800x1800 Retina PNG with native font rendering.
+
+**Notes:**
+- `screencapture -l` captures by window ID — the Terminal window doesn't need to be in the foreground. You can keep working in other windows.
+- The `-o` flag removes the window shadow.
+- `screencapture -l` fails silently without Screen Recording permission — you get a blank or tiny image.
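+- When finding the window ID, make sure to match `kCGWindowOwnerName == 'Terminal'` — other apps (Chrome, etc.) may be in front. The snippet takes the first matching on-screen window, so if several Terminal windows are open, keep the tmux one front-most among them (or minimize the others).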
+
+### Option B: Text capture (quick functional checks)
+
+Uses a detached tmux session — no visible window, no permissions needed. Good for verifying that specific text appears in the TUI or that navigation works. **Not a substitute for screenshots** when verifying layout, colors, or visual rendering.
+
+```bash
+# 1. Detached tmux session (no visible window)
+tmux new-session -d -s tui-test -x 200 -y 55
+
+# 2. Launch the TUI
+tmux send-keys -t tui-test 'cd /path/to/test-project && /path/to/tusk drift run' Enter
+sleep 25
+
+# 3. Capture the screen as plain text
+SCREEN=$(tmux capture-pane -t tui-test -p)
+
+# 4. Assert on content
+echo "$SCREEN" | grep -q "TEST EXECUTION" || echo "FAIL: header not found"
+echo "$SCREEN" | grep -q "Environment ready" || echo "FAIL: environment didn't start"
+
+# 5. Navigate and capture again
+tmux send-keys -t tui-test j
+sleep 0.5
+SCREEN=$(tmux capture-pane -t tui-test -p)
+
+# 6. Cleanup
+tmux send-keys -t tui-test q
+sleep 1
+tmux kill-session -t tui-test
+```
+
+### TUI keyboard shortcuts reference
+
+| Key       | Action                                  |
+| --------- | --------------------------------------- |
+| `j` / `k` | Select next/previous test in left panel |
+| `g` / `G` | Jump to top/bottom of test list         |
+| `u` / `d` | Half-page up/down in test list          |
+| `J` / `K` | Scroll log panel down/up                |
+| `U` / `D` | Half-page up/down in log panel          |
+| `y`       | Copy all logs                           |
+| `q`       | Quit                                    |
+
+## Recommended Dimensions
+
+| Setting      | Value | Notes                                      |
+| ------------ | ----- | ------------------------------------------ |
+| tmux columns | 200   | Wide enough for both TUI panels + detail   |
+| tmux rows    | 55    | Tall enough to see tests + logs            |
+| Window bounds | {0, 25, 1700, 1100} | Fits 14"/16" MacBook Pro (adjust for your display) |
+
+## Common Gotchas
+
+1. **Timing is critical.** The TUI renders asynchronously. Capturing too early gives an incomplete screen. When in doubt, wait longer.
+2. **tmux status bar.** The green bar at the bottom of native screenshots is tmux's status line. Hide it with `tmux set -t tui-test status off` before capturing.
+3. **Scrolling.** Content often extends below the visible area of the log panel. Send `J` or `D` keys to scroll down before capturing.
+4. **Screen Recording permission.** Native `screencapture -l` fails silently without it. Grant it to Terminal.app in System Settings > Privacy & Security > Screen Recording.
+5. **YAML quoting.** When editing config.yaml with commands containing colons or quotes, wrap the entire value in double quotes and escape inner double quotes with a backslash (`\"`).
+6. **Restore config.** Always restore `.tusk/config.yaml` after testing with modified start commands.
+7. **tmux targets by session name** (`-t tui-test`), so commands work regardless of which terminal you're focused on. You can keep working while tests run.
+8. **Window ID targeting.** When using `screencapture -l`, make sure the Python script finds the Terminal window, not Chrome or another app that may be in front.
diff --git a/internal/agent/prompts/phase_simple_test.md b/internal/agent/prompts/phase_simple_test.md
@@ -44,9 +44,10 @@ TuskDrift.initialize({
 Run tusk_run to replay the trace.
 If it fails:
 
-- Run with `debug: true` (keep running it in debug mode until it passes)
-- If startup fails in sandbox (for example secret manager bootstrapping), retry with `sandbox_mode: "off"`
-- Check for errors in the output or in the logs (in .tusk/logs/). Logs only appear if `debug: true` is set.
+- Check the service startup logs for errors (always shown on startup failure)
+- If the service starts but a test replay fails, run with `debug: true` to see runtime logs (keep running it in debug mode until it passes)
+- If you need more detail from the SDK itself, set `logLevel: "debug"` in the SDK initialization to see SDK-level diagnostics
+- If startup fails in sandbox (for example secret manager bootstrapping), retry with `sandbox_mode: "off"`; if that works, add a comment in config.yaml explaining why sandbox was disabled (e.g., `# sandbox disabled: service requires external secret manager during startup`)
 - If you see config-related errors (e.g., "no start command"), run `tusk_validate_config` to check for config issues
 - Try to fix issues and retry (max 3 attempts)
 - If still failing, ask the user for help

diff --git a/internal/agent/prompts/phase_verify_complex_test.md b/internal/agent/prompts/phase_verify_complex_test.md
@@ -29,7 +29,7 @@ Follow the same process as the simple test:
 4. Wait 3 seconds for trace to be written
 5. Stop the service
 6. Run tusk_list to verify trace was recorded
-7. Run tusk_run to replay (try with `debug: true` on failure; if startup fails in sandbox, retry with `sandbox_mode: "off"`)
+7. Run tusk_run to replay (startup logs are always shown on startup failure; if a test replay fails, retry with `debug: true` for runtime logs; if startup fails in sandbox, retry with `sandbox_mode: "off"` and, if that works, add a comment in config.yaml explaining why sandbox was disabled)
 
 ### Step 3: Save to Cache
 

diff --git a/internal/agent/prompts/phase_verify_simple_test.md b/internal/agent/prompts/phase_verify_simple_test.md
@@ -32,8 +32,10 @@ If no traces appear, this is a verification failure.
 Run tusk_run to replay the trace.
 If it fails:
 
-- Run with `debug: true` once more
-- If startup fails in sandbox, retry with `sandbox_mode: "off"`
+- Check the service startup logs for errors (always shown on startup failure)
+- If the service starts but a test replay fails, retry with `debug: true` to see runtime logs
+- If you need more detail from the SDK, set `logLevel: "debug"` in the SDK initialization
+- If startup fails in sandbox, retry with `sandbox_mode: "off"`; if that works, add a comment in config.yaml explaining why sandbox was disabled
 - If still failing, mark as failed
 
 ### Step 5: Save to Cache

diff --git a/internal/log/log.go b/internal/log/log.go
@@ -2,6 +2,7 @@
 package log
 
 import (
+	"context"
 	"io"
 	"log/slog"
 	"os"
@@ -90,15 +91,24 @@ func (l *Logger) process() {
 
 func (l *Logger) handleLogMessage(msg logMessage) {
 	tuiPtr := l.tuiLogger.Load()
-	if tuiPtr == nil {
+	if tuiPtr != nil { // a TUI logger is registered: forward the message to it and stop
+		tui := *tuiPtr
+		switch msg.msgType {
+		case logTypeService:
+			tui.LogToService(msg.message)
+		case logTypeTest:
+			tui.LogToCurrentTest(msg.testID, msg.message)
+		}
 		return
 	}
-	tui := *tuiPtr
+
+	// No TUI active (print mode). Route through slog at Debug level so messages
+	// respect the configured log level (hidden unless Debug is enabled) and use the standard slog format.
 	switch msg.msgType {
 	case logTypeService:
-		tui.LogToService(msg.message)
+		slog.Debug(msg.message)
 	case logTypeTest:
-		tui.LogToCurrentTest(msg.testID, msg.message)
+		slog.Debug(msg.message, "testID", msg.testID) // test ID travels as a structured attribute
 	}
 }
 
@@ -261,6 +271,15 @@ func TestLog(testID, msg string) {
 	}
 }
 
+// TestDebug forwards msg to TestLog for the given testID, but only when the default slog
+// logger has Debug level enabled (--debug flag); otherwise it returns without logging.
+func TestDebug(testID, msg string) {
+	if !slog.Default().Enabled(context.Background(), slog.LevelDebug) {
+		return // debug logging is off: drop the message
+	}
+	TestLog(testID, msg)
+}
+
 // TestOrServiceLog tries to log to test, falls back to service if testID is empty
 func TestOrServiceLog(testID, msg string) {
 	if testID != "" {

diff --git a/internal/runner/environment.go b/internal/runner/environment.go
@@ -25,6 +25,19 @@ func (e *Executor) StartEnvironment() error {
 		if e.GetEffectiveSandboxMode() == SandboxModeAuto && e.lastServiceSandboxed {
 			log.ServiceLog("⚠️  Service failed to start in sandbox; retrying once without sandbox...")
 			_ = e.StopService()
+
+			// Write separator so the user can see where the retry begins.
+			// The in-memory buffer survives StopService; the file path persists
+			// via serviceLogPath and setupServiceLogging will reopen in append mode.
+			if e.enableServiceLogs && e.serviceLogPath != "" {
+				if f, err := os.OpenFile(e.serviceLogPath, os.O_APPEND|os.O_WRONLY, 0o600); err == nil { // #nosec G304
+					_, _ = f.WriteString("⚠️ Retrying without sandbox...\n")
+					_ = f.Close()
+				}
+			} else if e.startupLogBuffer != nil {
+				_, _ = e.startupLogBuffer.Write([]byte("⚠️ Retrying without sandbox...\n"))
+			}
+
 			e.sandboxBypass = true
 			e.lastServiceSandboxed = false
 
@@ -52,6 +65,11 @@ waitForSDK:
 	log.ServiceLog("✅ SDK acknowledged")
 
 	log.Debug("Environment is ready")
+
+	// Discard the in-memory startup buffer now that startup succeeded.
+	// File-based logging (--enable-service-logs) persists for the full run.
+	e.DiscardStartupBuffer()
+
 	return nil
 }