Skip to content

Commit c276c07

Browse files
committed
fix
1 parent ae0b4c1 commit c276c07

File tree

2 files changed

+140
-24
lines changed

2 files changed

+140
-24
lines changed

eval_protocol/cli_commands/create_rft.py

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,37 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
251251
return f"{base}{suffix}"
252252

253253

254+
def _resolve_selected_test(
255+
project_root: str,
256+
evaluator_id: Optional[str],
257+
selected_tests: Optional[list] = None,
258+
) -> tuple[Optional[str], Optional[str]]:
259+
"""
260+
Resolve a single test's source file path and function name to use downstream.
261+
Priority:
262+
1) If selected_tests provided and length == 1, use it.
263+
2) Else discover tests; if exactly one test, use it.
264+
3) Else, if evaluator_id provided, match by normalized '<file-stem>-<func-name>'.
265+
Returns: (file_path, func_name) or (None, None) if unresolved.
266+
"""
267+
try:
268+
tests = selected_tests if selected_tests is not None else _discover_tests(project_root)
269+
if not tests:
270+
return None, None
271+
if len(tests) == 1:
272+
return tests[0].file_path, tests[0].qualname.split(".")[-1]
273+
if evaluator_id:
274+
for t in tests:
275+
func_name = t.qualname.split(".")[-1]
276+
source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
277+
candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
278+
if candidate == evaluator_id:
279+
return t.file_path, func_name
280+
return None, None
281+
except Exception:
282+
return None, None
283+
284+
254285
def _poll_evaluator_status(
255286
evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
256287
) -> bool:
@@ -354,12 +385,15 @@ def create_rft_command(args) -> int:
354385
if len(selected_tests) != 1:
355386
print("Error: Please select exactly one evaluation test for 'create rft'.")
356387
return 1
388+
# Derive evaluator_id from user's single selection
357389
chosen = selected_tests[0]
358390
func_name = chosen.qualname.split(".")[-1]
359391
source_file_name = os.path.splitext(os.path.basename(chosen.file_path))[0]
360392
evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
361-
selected_test_file_path = chosen.file_path
362-
selected_test_func_name = func_name
393+
# Resolve selected test once for downstream
394+
selected_test_file_path, selected_test_func_name = _resolve_selected_test(
395+
project_root, evaluator_id, selected_tests=selected_tests
396+
)
363397
# Resolve evaluator resource name to fully-qualified format required by API
364398
evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
365399

@@ -392,6 +426,11 @@ def create_rft_command(args) -> int:
392426
print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
393427
return 1
394428
skip_upload = True
429+
# Populate selected test info for dataset inference later
430+
st_path, st_func = _resolve_selected_test(project_root, evaluator_id)
431+
if st_path and st_func:
432+
selected_test_file_path = st_path
433+
selected_test_func_name = st_func
395434
except requests.exceptions.RequestException:
396435
pass
397436

@@ -402,32 +441,16 @@ def create_rft_command(args) -> int:
402441

403442
tests = _discover_tests(project_root)
404443
selected_entry: Optional[str] = None
405-
if len(tests) == 1:
406-
func_name = tests[0].qualname.split(".")[-1]
407-
abs_path = os.path.abspath(tests[0].file_path)
444+
st_path, st_func = _resolve_selected_test(project_root, evaluator_id, selected_tests=tests)
445+
if st_path and st_func:
446+
abs_path = os.path.abspath(st_path)
408447
try:
409448
rel = os.path.relpath(abs_path, project_root)
410449
except Exception:
411450
rel = abs_path
412-
selected_entry = f"{rel}::{func_name}"
413-
selected_test_file_path = tests[0].file_path
414-
selected_test_func_name = func_name
415-
else:
416-
# Try to match evaluator_id to a discovered test's normalized ID
417-
for t in tests:
418-
func_name = t.qualname.split(".")[-1]
419-
source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
420-
candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
421-
if candidate == evaluator_id:
422-
abs_path = os.path.abspath(t.file_path)
423-
try:
424-
rel = os.path.relpath(abs_path, project_root)
425-
except Exception:
426-
rel = abs_path
427-
selected_entry = f"{rel}::{func_name}"
428-
selected_test_file_path = t.file_path
429-
selected_test_func_name = func_name
430-
break
451+
selected_entry = f"{rel}::{st_func}"
452+
selected_test_file_path = st_path
453+
selected_test_func_name = st_func
431454
# If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
432455
if selected_entry is None and len(tests) > 1:
433456
print(

tests/test_cli_create_rft_infer.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,3 +615,96 @@ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, d
615615
assert captured["dataset_id"] is not None
616616
assert captured["dataset_id"].startswith("test-input-ds-test-input-ds-dataset-")
617617
assert captured["jsonl_path"] == str(id_jsonl)
618+
619+
620+
def test_create_rft_quiet_existing_evaluator_infers_dataset_from_matching_test(tmp_path, monkeypatch):
    """When the evaluator already exists (upload skipped) and several tests are
    discovered, dataset inference must select the test whose normalized
    '<file-stem>-<func-name>' id matches the supplied evaluator_id."""
    # Fresh project directory used as the CLI's working directory.
    project = tmp_path / "proj"
    project.mkdir()
    monkeypatch.chdir(project)

    # Environment the CLI reads credentials and endpoint from.
    monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
    monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
    monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")

    # Two discoverable tests, so resolution cannot rely on "exactly one test".
    alpha_file = project / "evals" / "alpha.py"
    beta_file = project / "evals" / "beta.py"
    alpha_file.parent.mkdir(parents=True, exist_ok=True)
    alpha_file.write_text("# alpha", encoding="utf-8")
    beta_file.write_text("# beta", encoding="utf-8")
    alpha_test = SimpleNamespace(qualname="alpha.test_one", file_path=str(alpha_file))
    beta_test = SimpleNamespace(qualname="beta.test_two", file_path=str(beta_file))
    monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [alpha_test, beta_test])

    # The remote evaluator reports ACTIVE, which makes the CLI skip the upload.
    class _ActiveEvaluatorResponse:
        ok = True

        def json(self):
            return {"state": "ACTIVE"}

        def raise_for_status(self):
            return None

    monkeypatch.setattr(cr.requests, "get", lambda *a, **k: _ActiveEvaluatorResponse())
    monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)

    # JSONL surfaced through the input_dataset extractor for the matching
    # test (beta.test_two).
    jsonl_path = project / "data.jsonl"
    jsonl_path.write_text('{"c":3}\n', encoding="utf-8")

    # Extractor stubs: the dataloader path yields nothing; the input-dataset
    # path returns our JSONL unconditionally — the selection itself is driven
    # by evaluator_id, not by the extractor.
    def _fake_input_dataset_extractor(file_path, func_name):
        return str(jsonl_path)

    monkeypatch.setattr(cr, "_extract_jsonl_from_dataloader", lambda f, fn: None)
    monkeypatch.setattr(cr, "_extract_jsonl_from_input_dataset", _fake_input_dataset_extractor)
    monkeypatch.setattr(cr, "detect_dataset_builder", lambda metric_dir: None)

    recorded = {"dataset_id": None, "jsonl_path": None}

    def _capture_dataset_upload(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
        recorded["dataset_id"] = dataset_id
        recorded["jsonl_path"] = jsonl_path
        return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}

    monkeypatch.setattr(cr, "create_dataset_from_jsonl", _capture_dataset_upload)
    monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})

    import argparse

    # Evaluator id that normalizes to beta.test_two's derived id.
    eval_id = cr._normalize_evaluator_id("beta-test_two")
    args = argparse.Namespace(
        evaluator_id=eval_id,
        yes=True,
        dry_run=False,
        force=False,
        env_file=None,
        dataset_id=None,
        dataset_jsonl=None,
        dataset_display_name=None,
        dataset_builder=None,
        base_model=None,
        warm_start_from="accounts/acct123/models/ft-abc123",
        output_model=None,
        n=None,
        max_tokens=None,
        learning_rate=None,
        batch_size=None,
        epochs=None,
        lora_rank=None,
        max_context_length=None,
        chunk_size=None,
        eval_auto_carveout=None,
    )

    rc = cr.create_rft_command(args)
    assert rc == 0
    assert recorded["dataset_id"] is not None
    # The dataset id must be derived from the evaluator id.
    assert recorded["dataset_id"].startswith(f"{eval_id}-dataset-")
    assert recorded["jsonl_path"] == str(jsonl_path)

0 commit comments

Comments
 (0)