Skip to content

Commit ce6277f

Browse files
committed
cli reduce
1 parent 6efbf13 commit ce6277f

File tree

3 files changed

+133
-1
lines changed

3 files changed

+133
-1
lines changed

eval_protocol/__init__.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,25 @@
99
"""
1010

1111
import importlib
12+
import sys
1213
import warnings
1314
from typing import TYPE_CHECKING
1415

1516
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
1617

18+
# Eager imports for symbols that conflict with module names - ONLY when pytest is running.
19+
# The reward_function.py module exports RewardFunction class, and we also export the
20+
# reward_function decorator from typed_interface. When pytest's AssertionRewritingHook
21+
# imports eval_protocol.reward_function as a module, Python would set
22+
# eval_protocol.reward_function to the module, shadowing our function export.
23+
#
24+
# We detect pytest by checking if _pytest or pytest is already loaded. This avoids
25+
# the ~500ms import overhead for non-test scenarios like the CLI.
26+
_running_under_pytest = "_pytest" in sys.modules or "pytest" in sys.modules
27+
if _running_under_pytest:
28+
from .reward_function import RewardFunction # noqa: E402
29+
from .typed_interface import reward_function # noqa: E402
30+
1731
# Lazy import mappings: name -> (module_path, attribute_name or None for module import)
1832
_LAZY_IMPORTS = {
1933
# From .auth
@@ -36,9 +50,12 @@
3650
# From .data_loader
3751
"DynamicDataLoader": (".data_loader", "DynamicDataLoader"),
3852
"InlineDataLoader": (".data_loader", "InlineDataLoader"),
39-
# Submodules
53+
# Submodules (accessible as eval_protocol.submodule)
4054
"mcp": (".mcp", None),
4155
"rewards": (".rewards", None),
56+
"models": (".models", None),
57+
"auth": (".auth", None),
58+
"config": (".config", None),
4259
# From .models
4360
"EvaluateResult": (".models", "EvaluateResult"),
4461
"Message": (".models", "Message"),

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
[pytest]
22
markers =
33
asyncio
4+
benchmark: marks tests as benchmark tests (for CLI startup time checks)
45
asyncio_mode = auto
56
asyncio_default_fixture_loop_scope = function
67
testpaths = tests ./eval_protocol/quickstart
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
"""
2+
Benchmark test for CLI startup time.
3+
4+
This test ensures the CLI startup time stays under the target threshold.
5+
Run with: pytest tests/test_cli_startup_benchmark.py -v
6+
"""
7+
8+
import subprocess
9+
import sys
10+
import time
11+
12+
import pytest
13+
14+
# Target: CLI should start in under 0.4 seconds
15+
CLI_STARTUP_TARGET_SECONDS = 0.4
16+
17+
# Number of runs to average (first run may be slower due to cold cache)
18+
NUM_RUNS = 3
19+
20+
21+
def measure_cli_startup_time() -> float:
    """Measure the wall-clock startup time of ``eval_protocol.cli --help``.

    Runs the CLI in a fresh interpreter via ``python -m`` so the measurement
    includes interpreter startup plus all import work done by the package.

    Returns:
        Elapsed time in seconds for a single ``--help`` invocation.

    Raises:
        AssertionError: If the CLI subprocess exits with a non-zero code.
    """
    # Proper import instead of the inline ``__import__("os")`` hack; a
    # function-scope import keeps this block self-contained.
    import os

    start = time.perf_counter()
    result = subprocess.run(
        [sys.executable, "-m", "eval_protocol.cli", "--help"],
        capture_output=True,
        text=True,
        # ``os.environ`` already unpacks as a mapping — no ``dict()`` copy
        # needed. Dummy key is presumably required so the CLI does not abort
        # on missing credentials — confirm against the CLI's auth handling.
        env={**os.environ, "FIREWORKS_API_KEY": "benchmark-test-key"},
    )
    elapsed = time.perf_counter() - start

    # Only trust the timing if the command actually succeeded.
    assert result.returncode == 0, f"CLI failed: {result.stderr}"

    return elapsed
37+
38+
@pytest.mark.benchmark
def test_cli_startup_time():
    """Assert the fastest observed CLI startup beats the target threshold."""
    samples = []

    for run_index in range(NUM_RUNS):
        duration = measure_cli_startup_time()
        samples.append(duration)
        print(f" Run {run_index + 1}: {duration:.3f}s")

    avg_time = sum(samples) / len(samples)
    min_time = min(samples)
    max_time = max(samples)

    print(f"\n Average: {avg_time:.3f}s")
    print(f" Min: {min_time:.3f}s")
    print(f" Max: {max_time:.3f}s")
    print(f" Target: {CLI_STARTUP_TARGET_SECONDS}s")

    # Judge by the best run, since CI machines add variable per-run overhead.
    assert min_time < CLI_STARTUP_TARGET_SECONDS, (
        f"CLI startup time ({min_time:.3f}s) exceeds target ({CLI_STARTUP_TARGET_SECONDS}s). "
        f"Check for import-time side effects or eager module loading."
    )
62+
63+
64+
@pytest.mark.benchmark
def test_package_import_time():
    """Assert that ``import eval_protocol`` stays fast (lazy-loading guard)."""
    # A subprocess gives a clean, uncached measurement of the import.
    snippet = """
import time
start = time.perf_counter()
import eval_protocol
elapsed = time.perf_counter() - start
print(f"{elapsed:.6f}")
"""
    completed = subprocess.run(
        [sys.executable, "-c", snippet],
        capture_output=True,
        text=True,
    )

    assert completed.returncode == 0, f"Import failed: {completed.stderr}"

    # The child process prints the elapsed seconds as its only stdout.
    import_time = float(completed.stdout.strip())
    print(f"\n Package import time: {import_time * 1000:.1f}ms")

    # With lazy loading in place the import should come in well under 50ms.
    assert import_time < 0.05, (
        f"Package import time ({import_time * 1000:.1f}ms) is too slow. "
        f"Check that __init__.py uses lazy loading correctly."
    )
91+
92+
93+
if __name__ == "__main__":
    # Standalone mode: run the benchmark directly without pytest.
    print("=== CLI Startup Benchmark ===\n")

    print("Testing CLI startup time...")
    samples = []
    for run_index in range(NUM_RUNS):
        duration = measure_cli_startup_time()
        samples.append(duration)
        print(f" Run {run_index + 1}: {duration:.3f}s")

    avg_time = sum(samples) / len(samples)
    min_time = min(samples)

    print(f"\n Average: {avg_time:.3f}s")
    print(f" Best: {min_time:.3f}s")
    print(f" Target: {CLI_STARTUP_TARGET_SECONDS}s")

    # Mirror the pytest check: pass/fail on the best run vs. the target.
    if min_time < CLI_STARTUP_TARGET_SECONDS:
        print(f"\n✓ PASS: CLI startup ({min_time:.3f}s) is under target ({CLI_STARTUP_TARGET_SECONDS}s)")
    else:
        print(f"\n✗ FAIL: CLI startup ({min_time:.3f}s) exceeds target ({CLI_STARTUP_TARGET_SECONDS}s)")
        sys.exit(1)

0 commit comments

Comments
 (0)