eval-protocol · mayinghan · Aug 13, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 13, 2025
diff --git a/.flake8 b/.flake8
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -42,11 +42,14 @@ jobs:
       - name: Install tau2 for testing
         run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
 
-      - name: Lint with flake8
-        run: uv run flake8 eval_protocol tests examples scripts --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics
+      - name: Ruff format (check)
+        run: uv run ruff format --check .
 
-      - name: Type check with mypy
-        run: uv run mypy eval_protocol
+      - name: Ruff lint
+        run: uv run ruff check .
+
+      - name: Type check with pyright
+        run: uv run pyright
 
   test-core:
     name: Core Tests (Python ${{ matrix.python-version }})
@@ -92,6 +95,10 @@ jobs:
             --ignore=tests/pytest/test_frozen_lake.py \
             --ignore=tests/pytest/test_lunar_lander.py \
             --ignore=tests/pytest/test_tau_bench_airline.py \
+            --ignore=tests/pytest/test_apps_coding.py \
+            --ignore=tests/test_tau_bench_airline_smoke.py \
+            --ignore=tests/pytest/test_svgbench.py \
+            --ignore=tests/pytest/test_livesvgbench.py \
             --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
 
       - name: Store coverage file

diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml
@@ -0,0 +1,188 @@
+name: E2E Smoke Test
+
+# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC
+on:
+  schedule:
+    - cron: '0 */6 * * *'
+  workflow_dispatch: # Allow manual triggering
+    inputs:
+      debug_mode:  # NOTE(review): no step in this workflow reads this input yet
+        description: 'Enable debug output'
+        required: false
+        default: false  # real boolean, matching `type: boolean` below
+        type: boolean
+
+jobs:
+  e2e-smoke-test:
+    name: E2E Smoke Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --locked --all-extras --dev
+
+      - name: Install tau2 for testing
+        run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
+
+      - name: Run E2E Smoke Test
+        id: run_test  # outputs are consumed by the "Validate test results" step
+        env:
+          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
+        run: |
+          echo "Running e2e smoke test..."
+
+          # Run the test, mirroring its stdout/stderr into test_output.log via tee
+          set +e  # Don't exit on failure; pass/fail is decided by the validation step
+
+          uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \
+            -v --tb=short --durations=10 \
+            --ep-print-summary \
+            --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log
+          # $? would be tee's status (normally 0); PIPESTATUS[0] is pytest's own exit code
+          TEST_EXIT_CODE=${PIPESTATUS[0]}
+
+          echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT
+
+          # List generated files for debugging
+          echo "📁 Generated files:"
+          ls -la *.json 2>/dev/null || echo "No JSON files found"
+          ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found"
+
+          # Parse EP summary from terminal output (more reliable than JSON files)
+          if [ -f test_output.log ]; then
+            echo "📋 Parsing EP summary from terminal output..."
+
+            # Show the terminal output for debugging
+            echo "Terminal output:"
+            cat test_output.log
+            echo ""
+
+            # Extract the EP Summary line from the terminal output
+            EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "")
+
+            if [ -n "$EP_SUMMARY_LINE" ]; then
+              echo "Found EP Summary line:"
+              echo "$EP_SUMMARY_LINE"
+
+              # Parse the agg score from the line: "EP Summary | ... agg=0.420 ..."
+              SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0")
+
+              # Extract other info
+              NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
+              NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0")
+
+              echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT
+
+              # Check if success rate meets thresholds (36% - 60% acceptable range)
+              LOWER_BOUND=0.36  # 36%
+              UPPER_BOUND=0.6  # 60%
+              LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0")
+              UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0")
+              THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0")
+
+              echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT
+              echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT
+              echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT
+
+              echo "📊 Evaluation Summary (from terminal output):"
+              echo "  - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%"
+              echo "  - Dataset rows evaluated: $NUM_ROWS"
+              echo "  - Number of runs: $NUM_RUNS"
+              echo "  - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+              echo "  - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")"
+            else
+              echo "❌ No EP Summary line found in terminal output"
+              echo "threshold_met=0" >> $GITHUB_OUTPUT
+              echo "success_rate=0" >> $GITHUB_OUTPUT
+            fi
+          else
+            echo "❌ No terminal output file found"
+            echo "threshold_met=0" >> $GITHUB_OUTPUT
+            echo "success_rate=0" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Upload test results
+        if: always()  # still upload when an earlier step failed, so the log is available for debugging
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-smoke-test-results-${{ github.run_number }}  # one artifact per workflow run number
+          path: |
+            test_output.log
+            ep_summary*.json
+            *.log
+          retention-days: 7
+
+      - name: Validate test results
+        if: always()  # run the verdict logic even if the test step itself errored
+        run: |
+          echo "Validating test results against thresholds..."
+
+          TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}"
+          THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}"
+          LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}"
+          UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}"
+          SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}"
+
+          echo "Test exit code: $TEST_EXIT_CODE"
+          echo "Threshold met (36%-60%): $THRESHOLD_MET"
+          echo "Lower bound met (≥36%): $LOWER_BOUND_MET"
+          echo "Upper bound met (≤60%): $UPPER_BOUND_MET"
+          echo "Success rate: $SUCCESS_RATE"
+
+          # Fail whenever the success rate is out of range; a non-zero pytest exit alone only warns
+          if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then
+            echo "❌ E2E smoke test FAILED"
+            echo "   - Test execution failed (exit code: $TEST_EXIT_CODE)"
+            echo "   - Success rate outside acceptable range (required: 36%-60%, actual: ${SUCCESS_RATE:-unknown})"
+            exit 1
+          elif [ "$TEST_EXIT_CODE" != "0" ]; then
+            echo "⚠️ E2E smoke test had test execution issues but may have met thresholds"
+            echo "   - Test exit code: $TEST_EXIT_CODE"
+            echo "   - Thresholds met: $THRESHOLD_MET"
+            # Don't exit with error if thresholds were actually met despite test issues
+            if [ "$THRESHOLD_MET" = "1" ]; then
+              echo "✅ Thresholds met despite execution issues - considering this a pass"
+            else
+              exit 1
+            fi
+          elif [ "$THRESHOLD_MET" != "1" ]; then
+            # Determine which bound was violated
+            if [ "$LOWER_BOUND_MET" != "1" ]; then
+              echo "❌ E2E smoke test FAILED - success rate too low"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Required: ≥36%"
+            elif [ "$UPPER_BOUND_MET" != "1" ]; then
+              echo "❌ E2E smoke test FAILED - success rate suspiciously high"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Maximum expected: ≤60%"
+              echo "   - This may indicate test issues or unrealistic performance"
+            else
+              echo "❌ E2E smoke test FAILED - success rate outside acceptable range"
+              echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+              echo "   - Required range: 36%-60%"
+            fi
+            exit 1
+          else
+            echo "✅ E2E smoke test PASSED"
+            echo "   - Success rate: ${SUCCESS_RATE:-unknown}"
+            echo "   - Within acceptable range: 36%-60%"
+          fi
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,43 +1,29 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
+exclude: |-  # "-" strips the block's trailing newline, which would otherwise be part of the regex
+  (^vite-app/|\.snap$)
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
     -   id: trailing-whitespace
+        exclude: "(^vite-app/|\\.snap$)"
     -   id: end-of-file-fixer
+        exclude: "(^vite-app/|\\.snap$)"
     -   id: check-yaml
     -   id: check-added-large-files
     -   id: check-merge-conflict
     -   id: check-toml
     -   id: detect-private-key
 
--   repo: https://github.com/psf/black
-    rev: 25.1.0
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.8
     hooks:
-    -   id: black
-        args: [--line-length=119]
+    -   id: ruff-format
+    -   id: ruff
+        args: ["--fix"]
 
--   repo: https://github.com/pycqa/isort
-    rev: 6.0.1
+-   repo: https://github.com/RobertCraigie/pyright-python
+    rev: v1.1.403
     hooks:
-    -   id: isort
-        name: isort (python)
-        args: ["--profile", "black", "--line-length", "119", "--filter-files"]
-
--   repo: https://github.com/pycqa/flake8
-    rev: 7.3.0
-    hooks:
-    -   id: flake8
-        args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"]
-        # additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins
-
--   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.17.0
-    hooks:
-    -   id: mypy
-        args: [--ignore-missing-imports, --install-types, --non-interactive]
-        additional_dependencies:
-        - types-requests
-        - types-setuptools
-        # Add other types-* packages your project uses
+    -   id: pyright
diff --git a/LICENSE b/LICENSE
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
diff --git a/development/normalize_sandbox_fusion.py b/development/normalize_sandbox_fusion.py
@@ -56,7 +56,7 @@
 try:
     repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
 except OSError:
-    print("Warning: Could not load gpt2 tokenizer for Repobench-P. " "Falling back to basic split for token counting.")
+    print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.")
     repobench_p_tokenizer = None
 
 
@@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str:
     """Format the prompt for Aider benchmark style problems."""
     question = problem_json.get("content", "")
     return (
-        f"{question}\n\nPlease generate the code in the following format:\n"
-        "```python\n# Your code response here\n```"
+        f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```"
     )
 
 
@@ -327,7 +326,7 @@ def normalize_problem_to_openai_format(
             try:
                 labels = json.loads(labels_data)
             except json.JSONDecodeError:
-                print(f"Warning: Skipping ID {problem_id_str} in {filename} " "- malformed JSON in labels.")
+                print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.")
                 return None
         elif isinstance(labels_data, dict):
             labels = labels_data
@@ -426,10 +425,10 @@ def normalize_problem_to_openai_format(
             )
             return None
         if not final_user_content.strip() or not final_assistant_content.strip():
-            print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "empty processed content.")
+            print(f"Warning: Skipping ID {problem_id_str} in {filename} - empty processed content.")
             return None
         if final_assistant_content.strip() == "import sys; sys.exit(0)":
-            print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "placeholder solution.")
+            print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.")
             return None
 
         return {
@@ -439,7 +438,7 @@ def normalize_problem_to_openai_format(
             ]
         }
     except Exception as e:
-        print(f"Warning: Skipping ID {problem_id_str} in {filename} - " f"error ({type(e).__name__}: {e}).")
+        print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).")
         import traceback
 
         traceback.print_exc()
@@ -474,7 +473,7 @@ def main():
                 file_error_count += 1
                 continue
 
-            print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: " f"{filename}...")
+            print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...")
             lines_in_file = 0
             processed_in_file = 0
             skipped_in_file = 0
@@ -488,7 +487,7 @@ def main():
                         try:
                             problem_data = json.loads(stripped_line)
                         except json.JSONDecodeError:
-                            print(f"Warning: Malformed JSON on line {line_number} " f"in {filepath}. Skipping line.")
+                            print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.")
                             skipped_in_file += 1
                             continue
 
@@ -507,7 +506,7 @@ def main():
                 processed_count += processed_in_file
                 skipped_count += skipped_in_file
             except Exception as e:
-                print(f"Error processing file {filepath}: {type(e).__name__}: {e}. " "Skipping rest of file.")
+                print(f"Error processing file {filepath}: {type(e).__name__}: {e}. Skipping rest of file.")
                 import traceback
 
                 traceback.print_exc()

diff --git a/development/notes/frozen_lake_context.md b/development/notes/frozen_lake_context.md