eval-protocol · benjibc · Aug 18, 2025 · Aug 17, 2025
diff --git a/.flake8 b/.flake8
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -42,11 +42,14 @@ jobs:
       - name: Install tau2 for testing
         run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
 
-      - name: Lint with flake8
-        run: uv run flake8 eval_protocol tests examples scripts --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics
+      - name: Ruff format (check)
+        run: uv run ruff format --check .
 
-      - name: Type check with mypy
-        run: uv run mypy eval_protocol
+      - name: Ruff lint
+        run: uv run ruff check .
+
+      - name: Type check with pyright
+        run: uv run pyright
 
   test-core:
     name: Core Tests (Python ${{ matrix.python-version }})

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,43 +1,29 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
+exclude: |  # (?x) = verbose regex, so the block scalar's trailing newline is ignored instead of being matched literally
+  (?x)(^vite-app/|\.snap$)
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
     -   id: trailing-whitespace
+        exclude: "(^vite-app/|\\.snap$)"
     -   id: end-of-file-fixer
+        exclude: "(^vite-app/|\\.snap$)"
     -   id: check-yaml
     -   id: check-added-large-files
     -   id: check-merge-conflict
     -   id: check-toml
     -   id: detect-private-key
 
--   repo: https://github.com/psf/black
-    rev: 25.1.0
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.8
     hooks:
-    -   id: black
-        args: [--line-length=119]
+    -   id: ruff  # lint with --fix must run before the formatter so auto-fixed code is reformatted in the same pass
+        args: ["--fix"]
+    -   id: ruff-format
 
--   repo: https://github.com/pycqa/isort
-    rev: 6.0.1
+-   repo: https://github.com/RobertCraigie/pyright-python
+    rev: v1.1.403
     hooks:
-    -   id: isort
-        name: isort (python)
-        args: ["--profile", "black", "--line-length", "119", "--filter-files"]
-
--   repo: https://github.com/pycqa/flake8
-    rev: 7.3.0
-    hooks:
-    -   id: flake8
-        args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"]
-        # additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins
-
--   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.17.0
-    hooks:
-    -   id: mypy
-        args: [--ignore-missing-imports, --install-types, --non-interactive]
-        additional_dependencies:
-        - types-requests
-        - types-setuptools
-        # Add other types-* packages your project uses
+    -   id: pyright
diff --git a/LICENSE b/LICENSE
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
diff --git a/development/normalize_sandbox_fusion.py b/development/normalize_sandbox_fusion.py
@@ -56,7 +56,7 @@
 try:
     repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
 except OSError:
-    print("Warning: Could not load gpt2 tokenizer for Repobench-P. " "Falling back to basic split for token counting.")
+    print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.")
     repobench_p_tokenizer = None
 
 
@@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str:
     """Format the prompt for Aider benchmark style problems."""
     question = problem_json.get("content", "")
     return (
-        f"{question}\n\nPlease generate the code in the following format:\n"
-        "```python\n# Your code response here\n```"
+        f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```"
     )
 
 
@@ -327,7 +326,7 @@ def normalize_problem_to_openai_format(
             try:
                 labels = json.loads(labels_data)
             except json.JSONDecodeError:
-                print(f"Warning: Skipping ID {problem_id_str} in {filename} " "- malformed JSON in labels.")
+                print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.")
                 return None
         elif isinstance(labels_data, dict):
             labels = labels_data
@@ -426,10 +425,10 @@ def normalize_problem_to_openai_format(
             )
             return None
         if not final_user_content.strip() or not final_assistant_content.strip():
-            print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "empty processed content.")
+            print(f"Warning: Skipping ID {problem_id_str} in {filename} - empty processed content.")
             return None
         if final_assistant_content.strip() == "import sys; sys.exit(0)":
-            print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "placeholder solution.")
+            print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.")
             return None
 
         return {
@@ -439,7 +438,7 @@ def normalize_problem_to_openai_format(
             ]
         }
     except Exception as e:
-        print(f"Warning: Skipping ID {problem_id_str} in {filename} - " f"error ({type(e).__name__}: {e}).")
+        print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).")
         import traceback
 
         traceback.print_exc()
@@ -474,7 +473,7 @@ def main():
                 file_error_count += 1
                 continue
 
-            print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: " f"{filename}...")
+            print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...")
             lines_in_file = 0
             processed_in_file = 0
             skipped_in_file = 0
@@ -488,7 +487,7 @@ def main():
                         try:
                             problem_data = json.loads(stripped_line)
                         except json.JSONDecodeError:
-                            print(f"Warning: Malformed JSON on line {line_number} " f"in {filepath}. Skipping line.")
+                            print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.")
                             skipped_in_file += 1
                             continue
 
@@ -507,7 +506,7 @@ def main():
                 processed_count += processed_in_file
                 skipped_count += skipped_in_file
             except Exception as e:
-                print(f"Error processing file {filepath}: {type(e).__name__}: {e}. " "Skipping rest of file.")
+                print(f"Error processing file {filepath}: {type(e).__name__}: {e}. Skipping rest of file.")
                 import traceback
 
                 traceback.print_exc()

diff --git a/development/notes/pytest_integration_proposal.md b/development/notes/pytest_integration_proposal.md
@@ -115,7 +115,7 @@ def tau2_rollout_processor(row: EvaluationRow, model: str, input_params: Dict, *
     #    from the dataset and provide a simulated tool response.
     # 4. Call the model again with the tool response.
     # 5. Construct a final EvaluationRow with the full transcript.
-    
+
     # The logic is encapsulated here, away from the test definition.
     processed_row = ep.default_rollout_processor(row, model, input_params)[0] # Simplified for example
     return [processed_row]
@@ -186,11 +186,11 @@ def best_of_n_processor(row: EvaluationRow, model: str, input_params: Dict, **kw
 
     # Then, apply a reward function to score each candidate.
     scored_rows = ep.evaluate(candidate_rows, score_politeness)
-    
+
     # Finally, select the best row.
     # This logic could be encapsulated in a helper, e.g., ep.select_best().
     best_row = select_best_by_group(scored_rows, score_key='politeness')
-    
+
     return [best_row]
 
 @evaluation_test(

diff --git a/development/utils/subprocess_manager.py b/development/utils/subprocess_manager.py
@@ -139,7 +139,7 @@ def start_ngrok_and_get_url(
         # Or by setting NGROK_AUTHTOKEN environment variable.
         # Forcing it via command line is also an option but less common for persistent setup.
         print(
-            f"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
+            "Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
         )
         # Example if passing via env for the subprocess:
         # ngrok_env = os.environ.copy()