Skip to content

Commit 332f25b

Browse files
author
Dylan Huang
committed
add more fields
1 parent 6dca6e3 commit 332f25b

File tree

3 files changed

+30
-3
lines changed

3 files changed

+30
-3
lines changed

eval_protocol/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,9 @@ class EvalMetadata(BaseModel):
209209
..., description="Version of the evaluation. By default, we will populate this with the current commit hash."
210210
)
211211
status: Literal["running", "finished", "error"] = Field("running", description="Status of the evaluation")
212+
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
213+
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
214+
threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
212215

213216

214217
class EvaluationRow(BaseModel):

eval_protocol/pytest/evaluation_test.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# Import versioneer for getting version information
77
import versioneer
88
from eval_protocol.dataset_logger import default_logger
9-
from eval_protocol.models import EvalMetadata, EvaluationRow
9+
from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata
1010
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
1111
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
1212
from eval_protocol.pytest.types import (
@@ -207,16 +207,34 @@ def wrapper_body(**kwargs):
207207
raise ValueError("No input dataset or input messages provided")
208208

209209
input_dataset: List[EvaluationRow] = []
210+
input_params = kwargs.get("input_params") or {}
210211
config = RolloutProcessorConfig(
211212
model=model_name,
212-
input_params=kwargs.get("input_params") or {},
213+
input_params=input_params,
213214
mcp_config_path=mcp_config_path or "",
214215
max_concurrent_rollouts=max_concurrent_rollouts,
215216
server_script_path=server_script_path,
216217
steps=steps,
217218
)
218219
input_dataset = execute_function(rollout_processor, rows=data, config=config)
219220

221+
# Populate completion_params in input_metadata for all rows
222+
completion_params = CompletionParams(
223+
model=model_name,
224+
temperature=input_params.get("temperature"),
225+
max_tokens=input_params.get("max_tokens"),
226+
max_tool_calls=input_params.get("max_tool_calls"),
227+
)
228+
229+
for row in input_dataset:
230+
if row.input_metadata is None:
231+
row.input_metadata = InputMetadata()
232+
row.input_metadata.completion_params = completion_params
233+
# Add mode to session_data
234+
if row.input_metadata.session_data is None:
235+
row.input_metadata.session_data = {}
236+
row.input_metadata.session_data["mode"] = mode
237+
220238
all_results: List[EvaluationRow] = []
221239
for _ in range(num_runs):
222240
if mode == "pointwise":
@@ -263,6 +281,9 @@ def wrapper_body(**kwargs):
263281
description=test_func.__doc__,
264282
version=versioneer.get_version(),
265283
status="finished",
284+
num_runs=num_runs,
285+
aggregation_method=aggregation_method,
286+
threshold_of_success=threshold_of_success,
266287
)
267288

268289
# Add metadata to all results before logging

vite-app/src/types/eval-protocol.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,10 @@ export const EvalMetadataSchema = z.object({
7878
name: z.string().describe('Name of the evaluation'),
7979
description: z.string().optional().describe('Description of the evaluation'),
8080
version: z.string().describe('Version of the evaluation. By default, we will populate this with the current commit hash.'),
81-
status: z.enum(['running', 'finished', 'error']).default('running').describe('Status of the evaluation')
81+
status: z.enum(['running', 'finished', 'error']).default('running').describe('Status of the evaluation'),
82+
num_runs: z.number().int().describe('Number of times the evaluation was repeated'),
83+
aggregation_method: z.string().describe('Method used to aggregate scores across runs'),
84+
threshold_of_success: z.number().optional().describe('Threshold score for test success')
8285
});
8386

8487
export const EvaluationRowSchema = z.object({

0 commit comments

Comments (0)