From a8445a19640e2ee25a62dc83f2c8824d791eefb2 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 18:33:45 +0000
Subject: [PATCH 1/7] e2e smoke test

---
 .github/workflows/e2e-smoke-test.yml   | 232 ++++++++++++++++++++++++
 tests/pytest/test_tau_bench_airline.py |   2 +-
 tests/test_tau_bench_airline_smoke.py  | 236 +++++++++++++++++++++++++
 3 files changed, 469 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/e2e-smoke-test.yml
 create mode 100644 tests/test_tau_bench_airline_smoke.py

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
new file mode 100644
index 00000000..47061980
--- /dev/null
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -0,0 +1,232 @@
+name: E2E Smoke Test
+
+# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
+on:
+  schedule:
+    - cron: '0 */6 * * *'
+  workflow_dispatch: # Allow manual triggering
+    inputs:
+      debug_mode:
+        description: 'Enable debug output'
+        required: false
+        default: 'false'
+        type: boolean
+
+jobs:
+  e2e-smoke-test:
+    name: E2E Smoke Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      - name: Install tau2 for testing
+        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
+
+      - name: Run E2E Smoke Test
+        id: run_test
+        env:
+          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
+        run: |
+          echo "Running e2e smoke test..."
+
+          # Run the test and capture both stdout and exit code
+          set +e  # Don't exit on failure
+
+          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
+            -v --tb=short --durations=10 \
+            --ep-print-summary \
+            --ep-summary-json=ep_summary.json
+
+          TEST_EXIT_CODE=$?
+
+          echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
+
+          # Parse evaluation protocol summary if it exists
+          if [ -f ep_summary.json ]; then
+            echo "EP Summary found, parsing..."
+
+            # Log the full summary for debugging
+            echo "EP Summary contents:"
+            cat ep_summary.json | jq . 2>/dev/null || cat ep_summary.json
+
+            # Extract success rate from EP summary (this contains the actual accuracy/success rate)
+            # The EP summary uses 'agg_score' for the aggregated success rate
+            SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0")
+
+            echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
+
+            # Check if success rate meets thresholds (40% - 90% acceptable range)
+            LOWER_BOUND=0.4  # 40%
+            UPPER_BOUND=0.9  # 90%
+            LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
+            UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
+            THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l)
+
+            echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
+            echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
+            echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
+
+            # Extract additional info for display
+            NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0")
+            NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0")
+
+            echo "📊 Evaluation Summary:"
+            echo "  - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
+            echo "  - Dataset rows evaluated: $NUM_ROWS"
+            echo "  - Number of runs: $NUM_RUNS"
+            echo "  - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+            echo "  - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+            echo "  - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+          else
+            echo "❌ No EP summary file found"
+            echo "threshold_met=0" >> $GITHUB_OUTPUT
+            echo "success_rate=0" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-smoke-test-results-${{ github.run_number }}
+          path: |
+            ep_summary.json
+            *.log
+          retention-days: 7
+
+      - name: Validate test results
+        if: always()
+        run: |
+          echo "Validating test results against thresholds..."
+
+          TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}"
+          THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}"
+          LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}"
+          UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}"
+          SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
+
+          echo "Test exit code: $TEST_EXIT_CODE"
+          echo "Threshold met (40%-90%): $THRESHOLD_MET"
+          echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
+          echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
+          echo "Success rate: $SUCCESS_RATE"
+
+          # Fail the job when the success-rate thresholds weren't met; a non-zero test exit code alone only warns
+          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
+            echo "❌ E2E smoke test FAILED"
+            echo "   - Test execution failed (exit code: $TEST_EXIT_CODE)"
+            echo "   - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
+            exit 1
+          elif [ "$TEST_EXIT_CODE" != "0" ]; then
+            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
+            echo "   - Test exit code: $TEST_EXIT_CODE"
+            echo "   - Thresholds met: $THRESHOLD_MET"
+            # Don't exit with error if thresholds were actually met despite test issues
+            if [ "$THRESHOLD_MET" = "1" ]; then
+              echo "✅ Thresholds met despite execution issues - considering this a pass"
+            else
+              exit 1
+            fi
+          elif [ "$THRESHOLD_MET" != "1" ]; then
+            # Determine which bound was violated
+            if [ "$LOWER_BOUND_MET" != "1" ]; then
+              echo "❌ E2E smoke test FAILED - success rate too low"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Required: ≥40%"
+            elif [ "$UPPER_BOUND_MET" != "1" ]; then
+              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Maximum expected: ≤90%"
+              echo "   - This may indicate test issues or unrealistic performance"
+            else
+              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Required range: 40%-90%"
+            fi
+            exit 1
+          else
+            echo "✅ E2E smoke test PASSED"
+            echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+            echo "   - Within acceptable range: 40%-90%"
+          fi
+
+      - name: Create GitHub issue on failure
+        if: failure()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const testResults = {
+              exitCode: '${{ steps.run_test.outputs.test_exit_code }}',
+              successRate: '${{ steps.run_test.outputs.success_rate }}',
+              thresholdMet: '${{ steps.run_test.outputs.threshold_met }}',
+              lowerBoundMet: '${{ steps.run_test.outputs.lower_bound_met }}',
+              upperBoundMet: '${{ steps.run_test.outputs.upper_bound_met }}'
+            };
+
+            const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`;
+
+            const body = `
+            ## E2E Smoke Test Failure Report
+
+            **Test:** E2E Smoke Test
+            **Date:** ${new Date().toISOString()}
+            **Workflow Run:** [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+
+                         ### Test Results
+             - **Success Rate:** ${testResults.successRate ? (parseFloat(testResults.successRate) * 100).toFixed(1) + '%' : 'Unknown'}
+             - **Lower Bound Met (≥40%):** ${testResults.lowerBoundMet === '1' ? '✅ Yes' : '❌ No'}
+             - **Upper Bound Met (≤90%):** ${testResults.upperBoundMet === '1' ? '✅ Yes' : '❌ No'}
+             - **Within Range (40%-90%):** ${testResults.thresholdMet === '1' ? '✅ Yes' : '❌ No'}
+             - **Test Exit Code:** ${testResults.exitCode || 'Unknown'}
+
+             ### Required Actions
+
+             ${ testResults.thresholdMet !== '1' ?
+               (testResults.lowerBoundMet !== '1' ?
+                 '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.' :
+                 testResults.upperBoundMet !== '1' ?
+                 '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.' :
+                 '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.'
+               ) :
+               '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.'
+             }
+
+            ### Next Steps
+            1. Review the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed error information
+            2. Check if this is a temporary issue by re-running the workflow manually
+            3. If persistent, investigate potential causes:
+               - Model performance degradation
+               - Test environment configuration
+               - API key or service availability issues
+
+            ### Auto-generated
+            This issue was automatically created by the E2E smoke test workflow.
+            `;
+
+            // Create the issue
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: title,
+              body: body,
+              labels: ['bug', 'e2e-test', 'automated', 'smoke-test']
+            });
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index 80aadf14..f5472092 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
+    rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
     rollout_processor=default_mcp_gym_rollout_processor,
     passed_threshold={"success": 0.4, "standard_deviation": 0.1},
     num_runs=8,
diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py
new file mode 100644
index 00000000..e0eae2ef
--- /dev/null
+++ b/tests/test_tau_bench_airline_smoke.py
@@ -0,0 +1,236 @@
+"""
+Smoke test for tau bench airline evaluation - runs with minimal configuration for CI/CD monitoring.
+
+This is a lightweight version of the full tau bench airline test, designed specifically
+for automated smoke testing in CI/CD pipelines. It runs with only 1 iteration to provide
+quick feedback on system health while minimizing resource usage.
+"""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List
+
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
+from vendor.tau2.data_model.message import (
+    AssistantMessage,
+    SystemMessage,
+    ToolCall,
+    ToolMessage,
+    UserMessage,
+)
+from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario
+from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator
+from vendor.tau2.evaluator.evaluator_action import ActionEvaluator
+from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator
+from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
+from vendor.tau2.registry import registry
+
+
+def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert entries from airline dataset to EvaluationRow objects for smoke testing.
+    """
+    rows = []
+    test_dir = Path(__file__).parent.parent / "examples" / "tau2_mcp" / "tests"
+
+    # Load system prompt from file so we can change it in one place
+    domain = data[0]["environment_context"]["domain"]
+    prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md"
+
+    with open(prompt_file, "r") as f:
+        system_prompt = f.read().strip()
+
+    for row in data:
+        eval_row = EvaluationRow(
+            messages=[Message(role="system", content=system_prompt)],
+            input_metadata=InputMetadata(
+                row_id=row["id"],
+                dataset_info={
+                    "environment_context": row["environment_context"],
+                    "user_simulation": row["user_simulation"],
+                    "evaluation_criteria": row["evaluation_criteria"],
+                    "user_prompt_template": row["user_prompt_template"],
+                },
+            ),
+        )
+
+        rows.append(eval_row)
+
+    return rows
+
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
+    dataset_adapter=tau_bench_airline_smoke_to_evaluation_row,
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
+    rollout_processor=default_mcp_gym_rollout_processor,
+    passed_threshold=0.4,
+    num_runs=1,  # Smoke test: single run for quick feedback
+    mode="pointwise",
+    max_concurrent_rollouts=50,  # Standard concurrency
+    server_script_path="examples/tau2_mcp/server.py",
+)
+def test_tau_bench_airline_smoke_evaluation(row: EvaluationRow) -> EvaluationRow:
+    """
+    Smoke test for tau bench airline evaluation - single run version for CI/CD monitoring.
+
+    This is a lightweight smoke test that runs the tau bench airline evaluation with
+    minimal configuration (1 run) to quickly validate system health and model performance.
+    It uses the same evaluation logic as the full test but with reduced resource usage.
+
+    Args:
+        row: EvaluationRow object from tau bench airline dataset after rollout
+
+    Returns:
+        EvaluationRow with tau2 evaluation results
+    """
+    messages = row.messages
+
+    # Get evaluation criteria from input_metadata.dataset_info
+    dataset_info = row.input_metadata.dataset_info if row.input_metadata else {}
+    evaluation_criteria = dataset_info.get("evaluation_criteria", {})
+
+    nl_assertions = evaluation_criteria.get("nl_assertions", [])
+    communicate_info = evaluation_criteria.get("communicate_info", [])
+    actions = evaluation_criteria.get("actions", [])
+
+    # Convert Message objects directly to tau2-bench message objects
+    trajectory_objects = []
+    for msg in messages:
+        role = msg.role
+        content = msg.content
+
+        if role == "system":
+            trajectory_objects.append(SystemMessage(role=role, content=content))
+        elif role == "assistant":
+            tau2_tool_calls = []
+            if msg.tool_calls:
+                for tool_call in msg.tool_calls:
+                    arguments = json.loads(tool_call.function.arguments)
+                    tau2_tool_call = ToolCall(
+                        id=tool_call.id,
+                        name=tool_call.function.name,
+                        arguments=arguments,
+                    )
+                    tau2_tool_calls.append(tau2_tool_call)
+
+            trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls))
+        elif role == "user":
+            trajectory_objects.append(UserMessage(role=role, content=content))
+        elif role == "tool":
+            tool_id = msg.tool_call_id
+            trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content))
+
+    reward = 1.0
+
+    evaluation_criteria = EvaluationCriteria(
+        nl_assertions=nl_assertions,
+        communicate_info=communicate_info,
+        actions=actions,
+        reward_basis=[
+            RewardType.DB,
+            RewardType.COMMUNICATE,
+        ],
+    )
+
+    task = Task(
+        id="SmokeTest", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="SmokeTest")
+    )  # id and user_scenario are required for the Task type but not used in calculating reward
+
+    if RewardType.DB in task.evaluation_criteria.reward_basis:
+        env_reward_info = EnvironmentEvaluator.calculate_reward(
+            environment_constructor=registry.get_env_constructor("airline"),
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.ACTION in task.evaluation_criteria.reward_basis:
+        action_reward_info = ActionEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis:
+        communicate_reward_info = CommunicateEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis:
+        nl_reward_info = NLAssertionsEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+
+    reward = 1.0
+    env_bases = {RewardType.DB, RewardType.ENV_ASSERTION}
+    action_bases = {RewardType.ACTION}
+    nl_bases = {RewardType.NL_ASSERTION}
+    comm_bases = {RewardType.COMMUNICATE}
+    task_reward_basis = set(task.evaluation_criteria.reward_basis)
+
+    reward_breakdown = {}
+    if task_reward_basis & env_bases:
+        if env_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(env_reward_info.reward_breakdown)
+        reward *= env_reward_info.reward
+    if task_reward_basis & action_bases:
+        if action_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(action_reward_info.reward_breakdown)
+        reward *= action_reward_info.reward
+    if task_reward_basis & nl_bases:
+        if nl_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(nl_reward_info.reward_breakdown)
+        reward *= nl_reward_info.reward
+    if task_reward_basis & comm_bases:
+        if communicate_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(communicate_reward_info.reward_breakdown)
+        reward *= communicate_reward_info.reward
+
+    # Generate reason showing only failed components
+    failed_reasons = []
+
+    if task_reward_basis & env_bases and env_reward_info.reward == 0:
+        failed_reasons.append("❌ Environment/DB check failed")
+
+    if task_reward_basis & action_bases and action_reward_info.reward == 0:
+        failed_actions = []
+        if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
+            failed_actions = [
+                f"{ac.action.name}({ac.action.arguments})"
+                for ac in action_reward_info.action_checks
+                if not ac.action_match
+            ]
+        if failed_actions:
+            failed_reasons.append(f"❌ Failed actions: {failed_actions}")
+        else:
+            failed_reasons.append("❌ Actions failed")
+
+    if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
+        failed_nl = []
+        if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
+            failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
+        if failed_nl:
+            failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
+        else:
+            failed_reasons.append("❌ NL Assertions failed")
+
+    if task_reward_basis & comm_bases and communicate_reward_info.reward == 0:
+        failed_comm = []
+        if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks:
+            failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met]
+        if failed_comm:
+            failed_reasons.append(f"❌ Failed communication: {failed_comm}")
+        else:
+            failed_reasons.append("❌ Communication failed")
+
+    # If everything passed, show success
+    reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed [SMOKE TEST]"
+
+    row.evaluation_result = EvaluateResult(
+        score=reward,
+        reason=reason,
+        metrics={},
+    )
+    return row

From e001fd60fce878bfeff78bbfd88d7d3a0523c580 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 18:44:29 +0000
Subject: [PATCH 2/7] Temporarily run e2e smoke test on push and pull_request to main

---
 .github/workflows/e2e-smoke-test.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index 47061980..d934eaf4 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -2,6 +2,16 @@ name: E2E Smoke Test
 
 # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
 on:
+  push:
+    branches: [main]
+    paths-ignore:
+      - "docs/**"
+      - "*.md"
+  pull_request:
+    branches: [main]
+    paths-ignore:
+      - "docs/**"
+      - "*.md"
   schedule:
     - cron: '0 */6 * * *'
   workflow_dispatch: # Allow manual triggering

From b39cc40b1496a8200cb1b6eea672294d7076305a Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 18:55:03 +0000
Subject: [PATCH 3/7] Locate EP summary file by glob and remove auto-issue step

---
 .github/workflows/e2e-smoke-test.yml | 90 +++++++---------------------
 1 file changed, 21 insertions(+), 69 deletions(-)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index d934eaf4..bdd1de2c 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -71,17 +71,31 @@ jobs:
 
           echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
 
+          # List generated files for debugging
+          echo "📁 Generated files:"
+          ls -la *.json 2>/dev/null || echo "No JSON files found"
+          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
+
           # Parse evaluation protocol summary if it exists
-          if [ -f ep_summary.json ]; then
-            echo "EP Summary found, parsing..."
+          # EP might generate files with different names, check for common patterns
+          EP_SUMMARY_FILE=""
+          for file in ep_summary*.json; do
+            if [ -f "$file" ]; then
+              EP_SUMMARY_FILE="$file"
+              break
+            fi
+          done
+
+          if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then
+            echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."
 
             # Log the full summary for debugging
             echo "EP Summary contents:"
-            cat ep_summary.json | jq . 2>/dev/null || cat ep_summary.json
+            cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE"
 
             # Extract success rate from EP summary (this contains the actual accuracy/success rate)
             # The EP summary uses 'agg_score' for the aggregated success rate
-            SUCCESS_RATE=$(jq -r '.agg_score // 0' ep_summary.json 2>/dev/null || echo "0")
+            SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
 
             echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
 
@@ -97,8 +111,8 @@ jobs:
             echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
 
             # Extract additional info for display
-            NUM_ROWS=$(jq -r '.rows // 0' ep_summary.json 2>/dev/null || echo "0")
-            NUM_RUNS=$(jq -r '.num_runs // 0' ep_summary.json 2>/dev/null || echo "0")
+            NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
+            NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
 
             echo "📊 Evaluation Summary:"
             echo "  - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
@@ -119,7 +133,7 @@ jobs:
         with:
           name: e2e-smoke-test-results-${{ github.run_number }}
           path: |
-            ep_summary.json
+            ep_summary*.json
             *.log
           retention-days: 7
 
@@ -178,65 +192,3 @@ jobs:
             echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
             echo "   - Within acceptable range: 40%-90%"
           fi
-
-      - name: Create GitHub issue on failure
-        if: failure()
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const testResults = {
-              exitCode: '${{ steps.run_test.outputs.test_exit_code }}',
-              successRate: '${{ steps.run_test.outputs.success_rate }}',
-              thresholdMet: '${{ steps.run_test.outputs.threshold_met }}',
-              lowerBoundMet: '${{ steps.run_test.outputs.lower_bound_met }}',
-              upperBoundMet: '${{ steps.run_test.outputs.upper_bound_met }}'
-            };
-
-            const title = `🚨 E2E Smoke Test Failed (${new Date().toISOString().split('T')[0]})`;
-
-            const body = `
-            ## E2E Smoke Test Failure Report
-
-            **Test:** E2E Smoke Test
-            **Date:** ${new Date().toISOString()}
-            **Workflow Run:** [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
-
-                         ### Test Results
-             - **Success Rate:** ${testResults.successRate ? (parseFloat(testResults.successRate) * 100).toFixed(1) + '%' : 'Unknown'}
-             - **Lower Bound Met (≥40%):** ${testResults.lowerBoundMet === '1' ? '✅ Yes' : '❌ No'}
-             - **Upper Bound Met (≤90%):** ${testResults.upperBoundMet === '1' ? '✅ Yes' : '❌ No'}
-             - **Within Range (40%-90%):** ${testResults.thresholdMet === '1' ? '✅ Yes' : '❌ No'}
-             - **Test Exit Code:** ${testResults.exitCode || 'Unknown'}
-
-             ### Required Actions
-
-             ${ testResults.thresholdMet !== '1' ?
-               (testResults.lowerBoundMet !== '1' ?
-                 '🔍 **Performance Issue:** The success rate is below the required 40% minimum threshold. This indicates potential issues with model performance or test environment.' :
-                 testResults.upperBoundMet !== '1' ?
-                 '⚠️ **Suspiciously High Performance:** The success rate exceeds 90%, which may indicate test issues, data leakage, or unrealistic performance.' :
-                 '🔍 **Performance Issue:** The success rate is outside the acceptable 40%-90% range.'
-               ) :
-               '🔧 **Infrastructure Issue:** Tests failed to execute properly despite potentially meeting performance thresholds.'
-             }
-
-            ### Next Steps
-            1. Review the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed error information
-            2. Check if this is a temporary issue by re-running the workflow manually
-            3. If persistent, investigate potential causes:
-               - Model performance degradation
-               - Test environment configuration
-               - API key or service availability issues
-
-            ### Auto-generated
-            This issue was automatically created by the E2E smoke test workflow.
-            `;
-
-            // Create the issue
-            await github.rest.issues.create({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              title: title,
-              body: body,
-              labels: ['bug', 'e2e-test', 'automated', 'smoke-test']
-            });

From 02e9b932e87a92f9f5cccdcdd0af4b21b0be0fc0 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 20:04:06 +0000
Subject: [PATCH 4/7] Parse EP summary from pytest terminal output

---
 .github/workflows/e2e-smoke-test.yml | 108 ++++++++++++++-------------
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index bdd1de2c..7521b3cb 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -65,7 +65,7 @@ jobs:
           uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
             -v --tb=short --durations=10 \
             --ep-print-summary \
-            --ep-summary-json=ep_summary.json
+            --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
 
-          TEST_EXIT_CODE=$?
+          TEST_EXIT_CODE=${PIPESTATUS[0]}  # pytest's status; plain $? after the pipe would be tee's (always 0)
 
@@ -76,53 +76,56 @@ jobs:
           ls -la *.json 2>/dev/null || echo "No JSON files found"
           ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
 
-          # Parse evaluation protocol summary if it exists
-          # EP might generate files with different names, check for common patterns
-          EP_SUMMARY_FILE=""
-          for file in ep_summary*.json; do
-            if [ -f "$file" ]; then
-              EP_SUMMARY_FILE="$file"
-              break
+          # Parse EP summary from terminal output (more reliable than JSON files)
+          if [ -f test_output.log ]; then
+            echo "📋 Parsing EP summary from terminal output..."
+
+            # Show the terminal output for debugging
+            echo "Terminal output:"
+            cat test_output.log
+            echo ""
+
+            # Extract the EP Summary line (last match only, so a repeated summary stays single-line)
+            EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null | tail -n 1 || true)
+
+            if [ -n "$EP_SUMMARY_LINE" ]; then
+              echo "Found EP Summary line:"
+              echo "$EP_SUMMARY_LINE"
+
+              # Parse the fields from the line: "EP Summary | ... agg=0.420 ... runs=N ... rows=N ..."
+              # Defaults use ${VAR:-0}: a trailing `|| echo "0"` never fires because the pipeline status is cut's.
+              SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | head -n 1 | cut -d= -f2 || true)
+              NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | head -n 1 | cut -d= -f2 || true)
+              NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | head -n 1 | cut -d= -f2 || true)
+              SUCCESS_RATE=${SUCCESS_RATE:-0}; NUM_RUNS=${NUM_RUNS:-0}; NUM_ROWS=${NUM_ROWS:-0}
+
+              echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
+
+              # Check if success rate meets thresholds (40% - 90% acceptable range)
+              LOWER_BOUND=0.4  # 40%
+              UPPER_BOUND=0.9  # 90%
+              LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
+              UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
+              THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
+
+              echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
+              echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
+              echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
+
+              echo "📊 Evaluation Summary (from terminal output):"
+              echo "  - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
+              echo "  - Dataset rows evaluated: $NUM_ROWS"
+              echo "  - Number of runs: $NUM_RUNS"
+              echo "  - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+            else
+              echo "❌ No EP Summary line found in terminal output"
+              echo "threshold_met=0" >> $GITHUB_OUTPUT
+              echo "success_rate=0" >> $GITHUB_OUTPUT
             fi
-          done
-
-          if [ -n "$EP_SUMMARY_FILE" ] && [ -f "$EP_SUMMARY_FILE" ]; then
-            echo "EP Summary found: $EP_SUMMARY_FILE, parsing..."
-
-            # Log the full summary for debugging
-            echo "EP Summary contents:"
-            cat "$EP_SUMMARY_FILE" | jq . 2>/dev/null || cat "$EP_SUMMARY_FILE"
-
-            # Extract success rate from EP summary (this contains the actual accuracy/success rate)
-            # The EP summary uses 'agg_score' for the aggregated success rate
-            SUCCESS_RATE=$(jq -r '.agg_score // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
-
-            echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
-
-            # Check if success rate meets thresholds (40% - 90% acceptable range)
-            LOWER_BOUND=0.4  # 40%
-            UPPER_BOUND=0.9  # 90%
-            LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l)
-            UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l)
-            THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l)
-
-            echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
-            echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
-            echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
-
-            # Extract additional info for display
-            NUM_ROWS=$(jq -r '.rows // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
-            NUM_RUNS=$(jq -r '.num_runs // 0' "$EP_SUMMARY_FILE" 2>/dev/null || echo "0")
-
-            echo "📊 Evaluation Summary:"
-            echo "  - Success rate (agg_score): $(echo "$SUCCESS_RATE * 100" | bc -l)%"
-            echo "  - Dataset rows evaluated: $NUM_ROWS"
-            echo "  - Number of runs: $NUM_RUNS"
-            echo "  - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
-            echo "  - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
-            echo "  - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
           else
-            echo "❌ No EP summary file found"
+            echo "❌ No terminal output file found"
             echo "threshold_met=0" >> $GITHUB_OUTPUT
             echo "success_rate=0" >> $GITHUB_OUTPUT
           fi
@@ -133,6 +136,7 @@ jobs:
         with:
           name: e2e-smoke-test-results-${{ github.run_number }}
           path: |
+            test_output.log
             ep_summary*.json
             *.log
           retention-days: 7
@@ -149,16 +153,16 @@ jobs:
           SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
 
           echo "Test exit code: $TEST_EXIT_CODE"
-          echo "Threshold met (40%-90%): $THRESHOLD_MET"
-          echo "Lower bound met (≥40%): $LOWER_BOUND_MET"
-          echo "Upper bound met (≤90%): $UPPER_BOUND_MET"
+          echo "Threshold met (36%-60%): $THRESHOLD_MET"
+          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
+          echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
           echo "Success rate: $SUCCESS_RATE"
 
           # Fail the job if tests didn't run successfully or thresholds weren't met
           if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
             echo "❌ E2E smoke test FAILED"
             echo "   - Test execution failed (exit code: $TEST_EXIT_CODE)"
-            echo "   - Success rate outside acceptable range (required: 40%-90%, actual: ${SUCCESS_RATE:-unknown})"
+            echo "   - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
             exit 1
           elif [ "$TEST_EXIT_CODE" != "0" ]; then
             echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
@@ -179,16 +183,16 @@ jobs:
             elif [ "$UPPER_BOUND_MET" != "1" ]; then
               echo "❌ E2E smoke test FAILED - success rate suspiciously high"
               echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-              echo "   - Maximum expected: ≤90%"
+              echo "   - Maximum expected: ≤60%"
               echo "   - This may indicate test issues or unrealistic performance"
             else
               echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
               echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-              echo "   - Required range: 40%-90%"
+              echo "   - Required range: 36%-60%"
             fi
             exit 1
           else
             echo "✅ E2E smoke test PASSED"
             echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
-            echo "   - Within acceptable range: 40%-90%"
+            echo "   - Within acceptable range: 36%-60%"
           fi

From d7e51f27146fbd443c7e233047bf58b0bd862bb2 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 20:05:27 +0000
Subject: [PATCH 5/7] adjust bounds

---
 .github/workflows/e2e-smoke-test.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index 7521b3cb..a20bbc27 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -101,9 +101,9 @@ jobs:
 
               echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
 
-              # Check if success rate meets thresholds (40% - 90% acceptable range)
-              LOWER_BOUND=0.4  # 40%
-              UPPER_BOUND=0.9  # 90%
+              # Check if success rate meets thresholds (36% - 60% acceptable range)
+              LOWER_BOUND=0.36  # 36%
+              UPPER_BOUND=0.6  # 60%
               LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
               UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
               THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
@@ -116,8 +116,8 @@ jobs:
               echo "  - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
               echo "  - Dataset rows evaluated: $NUM_ROWS"
               echo "  - Number of runs: $NUM_RUNS"
-              echo "  - Lower bound (≥40%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
-              echo "  - Upper bound (≤90%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
               echo "  - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
             else
               echo "❌ No EP Summary line found in terminal output"

From df26606705a986d05a808f0224ec6f69c9806443 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 20:09:07 +0000
Subject: [PATCH 6/7] change back to regular schedule

---
 .github/workflows/e2e-smoke-test.yml | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
index a20bbc27..ec91875c 100644
--- a/.github/workflows/e2e-smoke-test.yml
+++ b/.github/workflows/e2e-smoke-test.yml
@@ -2,16 +2,6 @@ name: E2E Smoke Test
 
 # Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
 on:
-  push:
-    branches: [main]
-    paths-ignore:
-      - "docs/**"
-      - "*.md"
-  pull_request:
-    branches: [main]
-    paths-ignore:
-      - "docs/**"
-      - "*.md"
   schedule:
     - cron: '0 */6 * * *'
   workflow_dispatch: # Allow manual triggering

From 1505018198cd39d854824ce29123e286ef54d1c3 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Wed, 13 Aug 2025 20:15:36 +0000
Subject: [PATCH 7/7] Lower tau-bench airline smoke test passed_threshold to 0.36

---
 tests/test_tau_bench_airline_smoke.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py
index e0eae2ef..e96baabe 100644
--- a/tests/test_tau_bench_airline_smoke.py
+++ b/tests/test_tau_bench_airline_smoke.py
@@ -68,7 +68,7 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> Lis
     model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
     rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    passed_threshold=0.4,
+    passed_threshold=0.36,
     num_runs=1,  # Smoke test: single run for quick feedback
     mode="pointwise",
     max_concurrent_rollouts=50,  # Standard concurrency