eval-protocol
diff --git a/eval_protocol/rewards/ifeval/README.md b/eval_protocol/benchmarks/ifeval/README.md
eval_protocol/rewards/ifeval/README.md renamed to eval_protocol/benchmarks/ifeval/README.md
Lines changed: 15 additions & 13 deletions
diff --git a/eval_protocol/benchmarks/ifeval/__init__.py b/eval_protocol/benchmarks/ifeval/__init__.py
Lines changed: 11 additions & 0 deletions
diff --git a/eval_protocol/benchmarks/ifeval/data/ifbench_test_sample.jsonl b/eval_protocol/benchmarks/ifeval/data/ifbench_test_sample.jsonl
Lines changed: 50 additions & 0 deletions
diff --git a/eval_protocol/rewards/ifeval/ifbench_instructions.py b/eval_protocol/benchmarks/ifeval/ifbench_instructions.py
eval_protocol/rewards/ifeval/ifbench_instructions.py renamed to eval_protocol/benchmarks/ifeval/ifbench_instructions.py
diff --git a/eval_protocol/rewards/ifeval/ifbench_registry.py b/eval_protocol/benchmarks/ifeval/ifbench_registry.py
eval_protocol/rewards/ifeval/ifbench_registry.py renamed to eval_protocol/benchmarks/ifeval/ifbench_registry.py
diff --git a/eval_protocol/rewards/ifeval/ifbench_util.py b/eval_protocol/benchmarks/ifeval/ifbench_util.py
eval_protocol/rewards/ifeval/ifbench_util.py renamed to eval_protocol/benchmarks/ifeval/ifbench_util.py
diff --git a/eval_protocol/rewards/ifeval/ifeval_instructions.py b/eval_protocol/benchmarks/ifeval/ifeval_instructions.py
eval_protocol/rewards/ifeval/ifeval_instructions.py renamed to eval_protocol/benchmarks/ifeval/ifeval_instructions.py
diff --git a/eval_protocol/rewards/ifeval/ifeval_registry.py b/eval_protocol/benchmarks/ifeval/ifeval_registry.py
eval_protocol/rewards/ifeval/ifeval_registry.py renamed to eval_protocol/benchmarks/ifeval/ifeval_registry.py
diff --git a/eval_protocol/rewards/ifeval/ifeval_util.py b/eval_protocol/benchmarks/ifeval/ifeval_util.py
eval_protocol/rewards/ifeval/ifeval_util.py renamed to eval_protocol/benchmarks/ifeval/ifeval_util.py
diff --git a/eval_protocol/rewards/ifeval/reward.py b/eval_protocol/benchmarks/ifeval/reward.py
eval_protocol/rewards/ifeval/reward.py renamed to eval_protocol/benchmarks/ifeval/reward.py
@@ -1,13 +1,19 @@
-# IFEval Reward Function
+# IFEval Benchmark
 
 Evaluates how well model responses follow instruction constraints. Returns a partial credit score (0.0 to 1.0).
 
-## Quick Start
+## Usage
+
+### As an eval-protocol benchmark test
+
+```bash
+pytest eval_protocol/benchmarks/ifeval/test_ifeval.py -v
+```
+
+### Standalone scoring function
 
 ```python
-import sys
-sys.path.insert(0, '/path/to/eval_protocol/rewards/ifeval')
-from reward import ifeval_partial_credit_reward
+from eval_protocol.benchmarks.ifeval import ifeval_partial_credit_reward
 
 response = "Hello world! This is my response."
 ground_truth = {
@@ -36,15 +42,11 @@ NLTK resources are downloaded automatically on first use.
 ## File Sources
 
 **Copied from `open-instruct/open_instruct/IFEvalG/`:**
-- `ifeval_instructions.py` (from `instructions.py`)
-- `ifeval_registry.py` (from `instructions_registry.py`)
-- `ifeval_util.py` (from `instructions_util.py`)
+- `ifeval_instructions.py`, `ifeval_registry.py`, `ifeval_util.py`
 
 **Copied from `IFBench/` (commit 8e6a9be, 2025-01):**
-- `ifbench_instructions.py` (from `instructions.py`)
-- `ifbench_registry.py` (from `instructions_registry.py`)
-- `ifbench_util.py` (from `instructions_util.py`)
+- `ifbench_instructions.py`, `ifbench_registry.py`, `ifbench_util.py`
 
 **New code:**
-- `reward.py` - main reward function
-- `__init__.py` - package exports
+- `reward.py` - scoring function
+- `test_ifeval.py` - eval-protocol benchmark test
@@ -0,0 +1,11 @@
+"""IFEval benchmark for evaluating instruction-following capabilities.
+
+Usage:
+    from eval_protocol.benchmarks.ifeval import ifeval_partial_credit_reward
+
+    score = ifeval_partial_credit_reward(response, ground_truth)
+"""
+
+from .reward import ifeval_partial_credit_reward
+
+__all__ = ["ifeval_partial_credit_reward"]