Skip to content

Commit 753ab8c

Browse files
committed
fix precommit
1 parent c35d7f0 commit 753ab8c

91 files changed

Lines changed: 1224 additions & 1809 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.flake8

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
[flake8]
2-
max-line-length = 119
3-
ignore = E203, W503
2+
max-line-length = 200
3+
ignore = E203, W503, E501, E402, F401, F541, F811, F841, E704, E713, E712, E231, E731, E226, W291, W293, W292, E302, W504
4+
exclude = vendor

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,15 @@ repos:
2929
rev: 7.3.0
3030
hooks:
3131
- id: flake8
32+
exclude: ^vendor/
3233
args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"]
3334
# additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins
3435

3536
- repo: https://github.com/pre-commit/mirrors-mypy
3637
rev: v1.17.0
3738
hooks:
3839
- id: mypy
40+
exclude: ^vendor/
3941
args: [--ignore-missing-imports, --install-types, --non-interactive]
4042
additional_dependencies:
4143
- types-requests

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21-
SOFTWARE.
21+
SOFTWARE.

Makefile

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,45 @@
11
PYTHON_DIRS = tests examples scripts eval_protocol
22

3-
.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release
3+
# Prefer tools from local virtualenv if present
4+
VENV ?= .venv
5+
VENV_BIN := $(VENV)/bin
6+
PYTHON := $(if $(wildcard $(VENV_BIN)/python),$(VENV_BIN)/python,python)
7+
FLAKE8 := $(if $(wildcard $(VENV_BIN)/flake8),$(VENV_BIN)/flake8,flake8)
8+
MYPY := $(if $(wildcard $(VENV_BIN)/mypy),$(VENV_BIN)/mypy,mypy)
9+
BLACK := $(if $(wildcard $(VENV_BIN)/black),$(VENV_BIN)/black,black)
10+
PRE_COMMIT := $(if $(wildcard $(VENV_BIN)/pre-commit),$(VENV_BIN)/pre-commit,pre-commit)
11+
PYTEST := $(if $(wildcard $(VENV_BIN)/pytest),$(VENV_BIN)/pytest,pytest)
12+
TWINE := $(if $(wildcard $(VENV_BIN)/twine),$(VENV_BIN)/twine,twine)
13+
14+
.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release pre-commit help
415

516
clean:
617
rm -rf build/ dist/ *.egg-info/
718

19+
# Run all pre-commit hooks (if installed)
820
pre-commit:
9-
pre-commit run --all-files
21+
$(PRE_COMMIT) run --all-files
1022

1123
build: clean
12-
python -m build
24+
$(PYTHON) -m build
1325

1426
dist: build
1527

1628
upload:
17-
twine upload dist/*
29+
$(TWINE) upload dist/*
1830

1931
test:
20-
pytest
32+
$(PYTEST)
2133

2234
lint:
23-
flake8 $(PYTHON_DIRS)
35+
$(PRE_COMMIT) run flake8 --all-files
2436

2537
typecheck:
26-
mypy $(PYTHON_DIRS)
38+
$(PRE_COMMIT) run mypy --all-files
2739

2840
format:
29-
black $(PYTHON_DIRS)
41+
$(PRE_COMMIT) run black --all-files && \
42+
$(PRE_COMMIT) run isort --all-files
3043

3144
validate-docs:
3245
@echo "Validating documentation links..."
@@ -140,9 +153,9 @@ help:
140153
@echo " dist - Alias for build"
141154
@echo " upload - Upload to PyPI (make sure to bump version first)"
142155
@echo " test - Run tests"
143-
@echo " lint - Run flake8 linter"
144-
@echo " typecheck - Run mypy type checker"
145-
@echo " format - Run black code formatter"
156+
@echo " lint - Run flake8 via pre-commit"
157+
@echo " typecheck - Run mypy via pre-commit"
158+
@echo " format - Run black + isort via pre-commit"
146159
@echo " validate-docs - Validate all documentation links in docs.json"
147160
@echo " sync-docs - Sync docs to ~/home/docs with links under 'evaluators'"
148161
@echo " release - Run lint, typecheck, test, build, then upload"

development/notes/pytest_integration_proposal.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def tau2_rollout_processor(row: EvaluationRow, model: str, input_params: Dict, *
115115
# from the dataset and provide a simulated tool response.
116116
# 4. Call the model again with the tool response.
117117
# 5. Construct a final EvaluationRow with the full transcript.
118-
118+
119119
# The logic is encapsulated here, away from the test definition.
120120
processed_row = ep.default_rollout_processor(row, model, input_params)[0] # Simplified for example
121121
return [processed_row]
@@ -186,11 +186,11 @@ def best_of_n_processor(row: EvaluationRow, model: str, input_params: Dict, **kw
186186

187187
# Then, apply a reward function to score each candidate.
188188
scored_rows = ep.evaluate(candidate_rows, score_politeness)
189-
189+
190190
# Finally, select the best row.
191191
# This logic could be encapsulated in a helper, e.g., ep.select_best().
192192
best_row = select_best_by_group(scored_rows, score_key='politeness')
193-
193+
194194
return [best_row]
195195

196196
@evaluation_test(

eval_protocol/adapters/CONTRIBUTING.md

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -37,36 +37,36 @@ except ImportError:
3737

3838
class YourCustomAdapter:
3939
"""Adapter for integrating with Your Custom Data Source.
40-
40+
4141
This adapter loads data from Your Custom Data Source and converts it
4242
to EvaluationRow format for use in evaluation pipelines.
43-
43+
4444
Examples:
4545
Basic usage:
4646
>>> adapter = YourCustomAdapter(api_key="your_key")
4747
>>> rows = list(adapter.get_evaluation_rows(limit=10))
4848
"""
49-
49+
5050
def __init__(self, **config):
5151
"""Initialize the adapter with configuration."""
5252
if not DEPENDENCY_AVAILABLE:
5353
raise ImportError("your_external_library not installed")
54-
54+
5555
# Initialize your client/connection here
5656
self.client = your_external_library.Client(**config)
57-
57+
5858
def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]:
5959
"""Main method to fetch and convert data to EvaluationRow format.
60-
60+
6161
Args:
6262
**kwargs: Adapter-specific parameters
63-
63+
6464
Yields:
6565
EvaluationRow: Converted evaluation rows
6666
"""
6767
# Implement your data fetching logic
6868
raw_data = self.client.fetch_data(**kwargs)
69-
69+
7070
for item in raw_data:
7171
try:
7272
eval_row = self._convert_to_evaluation_row(item)
@@ -75,51 +75,51 @@ class YourCustomAdapter:
7575
except Exception as e:
7676
logger.warning(f"Failed to convert item: {e}")
7777
continue
78-
78+
7979
def _convert_to_evaluation_row(self, raw_item: Any) -> Optional[EvaluationRow]:
8080
"""Convert a raw data item to EvaluationRow format.
81-
81+
8282
Args:
8383
raw_item: Raw data item from your source
84-
84+
8585
Returns:
8686
EvaluationRow or None if conversion fails
8787
"""
8888
# Extract messages from your data format
8989
messages = self._extract_messages(raw_item)
90-
90+
9191
# Extract metadata
9292
input_metadata = self._create_input_metadata(raw_item)
93-
93+
9494
# Extract ground truth if available
9595
ground_truth = self._extract_ground_truth(raw_item)
96-
96+
9797
# Extract tools if available (for tool calling scenarios)
9898
tools = self._extract_tools(raw_item)
99-
99+
100100
return EvaluationRow(
101101
messages=messages,
102102
tools=tools,
103103
input_metadata=input_metadata,
104104
ground_truth=ground_truth,
105105
)
106-
106+
107107
def _extract_messages(self, raw_item: Any) -> List[Message]:
108108
"""Extract conversation messages from raw data."""
109109
# Implement message extraction logic
110110
# Convert your data format to List[Message]
111111
pass
112-
112+
113113
def _create_input_metadata(self, raw_item: Any) -> InputMetadata:
114114
"""Create InputMetadata from raw data."""
115115
# Implement metadata extraction
116116
pass
117-
117+
118118
def _extract_ground_truth(self, raw_item: Any) -> Optional[str]:
119119
"""Extract ground truth if available."""
120120
# Implement ground truth extraction
121121
pass
122-
122+
123123
def _extract_tools(self, raw_item: Any) -> Optional[List[Dict[str, Any]]]:
124124
"""Extract tool definitions if available."""
125125
# Implement tool extraction for tool calling scenarios
@@ -149,7 +149,7 @@ message = Message(
149149
content="I'll help you with that calculation.",
150150
tool_calls=[{
151151
"id": "call_123",
152-
"type": "function",
152+
"type": "function",
153153
"function": {
154154
"name": "calculate",
155155
"arguments": '{"x": 5, "y": 3}'
@@ -185,7 +185,7 @@ input_metadata = InputMetadata(
185185
},
186186
session_data={
187187
"user_id": "user123",
188-
"session_id": "session456",
188+
"session_id": "session456",
189189
"timestamp": "2024-01-01T00:00:00Z",
190190
}
191191
)
@@ -259,7 +259,7 @@ def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]:
259259
except Exception as e:
260260
logger.error(f"Failed to fetch data: {e}")
261261
return
262-
262+
263263
for item in data:
264264
try:
265265
row = self._convert_to_evaluation_row(item)
@@ -298,36 +298,36 @@ from eval_protocol.models import EvaluationRow
298298

299299
class TestYourCustomAdapter:
300300
"""Test suite for YourCustomAdapter."""
301-
301+
302302
def test_initialization(self):
303303
"""Test adapter initialization."""
304304
adapter = YourCustomAdapter(api_key="test_key")
305305
assert adapter.client is not None
306-
306+
307307
def test_get_evaluation_rows(self):
308308
"""Test conversion to EvaluationRow format."""
309309
adapter = YourCustomAdapter(api_key="test_key")
310-
310+
311311
# Mock the external API response
312312
with patch.object(adapter.client, 'fetch_data') as mock_fetch:
313313
mock_fetch.return_value = [
314314
# Mock data in your format
315315
{"id": "1", "question": "Test?", "answer": "Yes"}
316316
]
317-
317+
318318
rows = list(adapter.get_evaluation_rows(limit=1))
319-
319+
320320
assert len(rows) == 1
321321
assert isinstance(rows[0], EvaluationRow)
322322
assert len(rows[0].messages) > 0
323-
323+
324324
def test_error_handling(self):
325325
"""Test error handling."""
326326
adapter = YourCustomAdapter(api_key="test_key")
327-
327+
328328
with patch.object(adapter.client, 'fetch_data') as mock_fetch:
329329
mock_fetch.side_effect = Exception("API Error")
330-
330+
331331
rows = list(adapter.get_evaluation_rows())
332332
assert len(rows) == 0 # Should handle error gracefully
333333
```
@@ -341,18 +341,18 @@ For simple chat data:
341341
```python
342342
def _extract_messages(self, conversation: Dict) -> List[Message]:
343343
messages = []
344-
344+
345345
# Add system prompt if available
346346
if conversation.get('system_prompt'):
347347
messages.append(Message(role="system", content=conversation['system_prompt']))
348-
348+
349349
# Add conversation turns
350350
for turn in conversation['turns']:
351351
messages.append(Message(
352352
role=turn['role'],
353353
content=turn['content']
354354
))
355-
355+
356356
return messages
357357
```
358358

@@ -363,27 +363,27 @@ For tool calling scenarios:
363363
```python
364364
def _extract_messages(self, trace: Dict) -> List[Message]:
365365
messages = []
366-
366+
367367
for step in trace['steps']:
368368
if step['type'] == 'user_message':
369369
messages.append(Message(role="user", content=step['content']))
370-
370+
371371
elif step['type'] == 'assistant_message':
372372
message = Message(role="assistant", content=step.get('content'))
373-
373+
374374
# Add tool calls if present
375375
if step.get('tool_calls'):
376376
message.tool_calls = step['tool_calls']
377-
377+
378378
messages.append(message)
379-
379+
380380
elif step['type'] == 'tool_response':
381381
messages.append(Message(
382382
role="tool",
383383
content=step['content'],
384384
tool_call_id=step['tool_call_id']
385385
))
386-
386+
387387
return messages
388388
```
389389

@@ -515,10 +515,10 @@ Here are some potential adapters that would be valuable:
515515

516516
- **OpenAI Evals**: Load data from OpenAI's evals repository
517517
- **LLM Evaluation Datasets**: MMLU, HellaSwag, etc.
518-
- **Chat Platforms**: Discord, Slack conversation exports
518+
- **Chat Platforms**: Discord, Slack conversation exports
519519
- **Monitoring Tools**: Other observability platforms
520520
- **Custom APIs**: Company-specific data sources
521521
- **File Formats**: Parquet, Excel, database exports
522522
- **Research Datasets**: Academic benchmarks and competitions
523523

524-
We welcome contributions for any of these or other creative integrations!
524+
We welcome contributions for any of these or other creative integrations!

0 commit comments

Comments (0)