Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .flake8

This file was deleted.

11 changes: 7 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ jobs:
- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Lint with flake8
run: uv run flake8 eval_protocol tests examples scripts --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics
- name: Ruff format (check)
run: uv run ruff format --check .

- name: Type check with mypy
run: uv run mypy eval_protocol
- name: Ruff lint
run: uv run ruff check .

- name: Type check with pyright
run: uv run pyright

test-core:
name: Core Tests (Python ${{ matrix.python-version }})
Expand Down
40 changes: 13 additions & 27 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,43 +1,29 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: |
(^vite-app/|\.snap$)
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
exclude: "(^vite-app/|\\.snap$)"
- id: end-of-file-fixer
exclude: "(^vite-app/|\\.snap$)"
- id: check-yaml
- id: check-added-large-files
- id: check-merge-conflict
- id: check-toml
- id: detect-private-key

- repo: https://github.com/psf/black
rev: 25.1.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.8
hooks:
- id: black
args: [--line-length=119]
- id: ruff-format
- id: ruff
args: ["--fix"]

- repo: https://github.com/pycqa/isort
rev: 6.0.1
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.403
hooks:
- id: isort
name: isort (python)
args: ["--profile", "black", "--line-length", "119", "--filter-files"]

- repo: https://github.com/pycqa/flake8
rev: 7.3.0
hooks:
- id: flake8
args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"]
# additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.0
hooks:
- id: mypy
args: [--ignore-missing-imports, --install-types, --non-interactive]
additional_dependencies:
- types-requests
- types-setuptools
# Add other types-* packages your project uses
- id: pyright
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.
19 changes: 9 additions & 10 deletions development/normalize_sandbox_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
try:
repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
except OSError:
print("Warning: Could not load gpt2 tokenizer for Repobench-P. " "Falling back to basic split for token counting.")
print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.")
repobench_p_tokenizer = None


Expand Down Expand Up @@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str:
"""Format the prompt for Aider benchmark style problems."""
question = problem_json.get("content", "")
return (
f"{question}\n\nPlease generate the code in the following format:\n"
"```python\n# Your code response here\n```"
f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```"
)


Expand Down Expand Up @@ -327,7 +326,7 @@ def normalize_problem_to_openai_format(
try:
labels = json.loads(labels_data)
except json.JSONDecodeError:
print(f"Warning: Skipping ID {problem_id_str} in {filename} " "- malformed JSON in labels.")
print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.")
return None
elif isinstance(labels_data, dict):
labels = labels_data
Expand Down Expand Up @@ -426,10 +425,10 @@ def normalize_problem_to_openai_format(
)
return None
if not final_user_content.strip() or not final_assistant_content.strip():
print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "empty processed content.")
print(f"Warning: Skipping ID {problem_id_str} in {filename} - empty processed content.")
return None
if final_assistant_content.strip() == "import sys; sys.exit(0)":
print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "placeholder solution.")
print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.")
return None

return {
Expand All @@ -439,7 +438,7 @@ def normalize_problem_to_openai_format(
]
}
except Exception as e:
print(f"Warning: Skipping ID {problem_id_str} in {filename} - " f"error ({type(e).__name__}: {e}).")
print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).")
import traceback

traceback.print_exc()
Expand Down Expand Up @@ -474,7 +473,7 @@ def main():
file_error_count += 1
continue

print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: " f"{filename}...")
print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...")
lines_in_file = 0
processed_in_file = 0
skipped_in_file = 0
Expand All @@ -488,7 +487,7 @@ def main():
try:
problem_data = json.loads(stripped_line)
except json.JSONDecodeError:
print(f"Warning: Malformed JSON on line {line_number} " f"in {filepath}. Skipping line.")
print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.")
skipped_in_file += 1
continue

Expand All @@ -507,7 +506,7 @@ def main():
processed_count += processed_in_file
skipped_count += skipped_in_file
except Exception as e:
print(f"Error processing file {filepath}: {type(e).__name__}: {e}. " "Skipping rest of file.")
print(f"Error processing file {filepath}: {type(e).__name__}: {e}. Skipping rest of file.")
import traceback

traceback.print_exc()
Expand Down
6 changes: 3 additions & 3 deletions development/notes/pytest_integration_proposal.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def tau2_rollout_processor(row: EvaluationRow, model: str, input_params: Dict, *
# from the dataset and provide a simulated tool response.
# 4. Call the model again with the tool response.
# 5. Construct a final EvaluationRow with the full transcript.

# The logic is encapsulated here, away from the test definition.
processed_row = ep.default_rollout_processor(row, model, input_params)[0] # Simplified for example
return [processed_row]
Expand Down Expand Up @@ -186,11 +186,11 @@ def best_of_n_processor(row: EvaluationRow, model: str, input_params: Dict, **kw

# Then, apply a reward function to score each candidate.
scored_rows = ep.evaluate(candidate_rows, score_politeness)

# Finally, select the best row.
# This logic could be encapsulated in a helper, e.g., ep.select_best().
best_row = select_best_by_group(scored_rows, score_key='politeness')

return [best_row]

@evaluation_test(
Expand Down
2 changes: 1 addition & 1 deletion development/utils/subprocess_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def start_ngrok_and_get_url(
# Or by setting NGROK_AUTHTOKEN environment variable.
# Forcing it via command line is also an option but less common for persistent setup.
print(
f"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
)
# Example if passing via env for the subprocess:
# ngrok_env = os.environ.copy()
Expand Down
Loading
Loading