From d87b8c1f753980b1f24b5ba50e9beb47322fdcbf Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Tue, 16 Jun 2026 15:29:09 -0400 Subject: [PATCH 1/7] Add registration for security v1.0 benchmark. --- src/modelgauge/annotator_registry.py | 9 +++++++-- src/modelgauge/annotators/cheval/ids.py | 1 + .../annotators/cheval/registration.py | 19 +++++++++++++++++-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py index 6cfad021b..8238302ad 100644 --- a/src/modelgauge/annotator_registry.py +++ b/src/modelgauge/annotator_registry.py @@ -1,6 +1,10 @@ -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID -from modelgauge.instance_factory import InstanceFactory from modelgauge.annotator import Annotator +from modelgauge.annotators.cheval.ids import ( + SAFETY_ANNOTATOR_V1_1_UID, + SECURITY_ANNOTATOR_V0_5_UID, + SECURITY_ANNOTATOR_V1_0_UID, +) +from modelgauge.instance_factory import InstanceFactory ANNOTATOR_MODULE_MAP = { "llama_guard_1": "llama_guard_annotator", @@ -10,6 +14,7 @@ "perspective_api": "perspective_api", SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration", SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration", + SECURITY_ANNOTATOR_V1_0_UID: "cheval.registration", } # The list of all Annotators instances with assigned UIDs. 
diff --git a/src/modelgauge/annotators/cheval/ids.py b/src/modelgauge/annotators/cheval/ids.py index 6ad15492d..690c2fd51 100644 --- a/src/modelgauge/annotators/cheval/ids.py +++ b/src/modelgauge/annotators/cheval/ids.py @@ -1,2 +1,3 @@ SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1" SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5" +SECURITY_ANNOTATOR_V1_0_UID = "security-v1.0" diff --git a/src/modelgauge/annotators/cheval/registration.py b/src/modelgauge/annotators/cheval/registration.py index 068282f46..4b0b67042 100644 --- a/src/modelgauge/annotators/cheval/registration.py +++ b/src/modelgauge/annotators/cheval/registration.py @@ -1,6 +1,14 @@ from modelgauge.annotator_registry import ANNOTATORS -from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID +from modelgauge.annotators.cheval.annotator import ( + ChevalAnnotator, + ChevalAPIKey, + ChevalEndpointUrl, +) +from modelgauge.annotators.cheval.ids import ( + SAFETY_ANNOTATOR_V1_1_UID, + SECURITY_ANNOTATOR_V0_5_UID, + SECURITY_ANNOTATOR_V1_0_UID, +) from modelgauge.secret_values import InjectSecret ANNOTATORS.register( @@ -17,3 +25,10 @@ InjectSecret(ChevalAPIKey), InjectSecret(ChevalEndpointUrl), ) + +ANNOTATORS.register( + ChevalAnnotator, + SECURITY_ANNOTATOR_V1_0_UID, + InjectSecret(ChevalAPIKey), + InjectSecret(ChevalEndpointUrl), +) From e1aa7373b9056f01cac1793890c3cefc0a58335a Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Tue, 16 Jun 2026 15:41:37 -0400 Subject: [PATCH 2/7] Ensure tests use security annotator. 
--- src/modelgauge/tests/security.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py index 26d6219df..a26abfdf4 100644 --- a/src/modelgauge/tests/security.py +++ b/src/modelgauge/tests/security.py @@ -5,11 +5,12 @@ from pydantic import BaseModel from modelgauge.aggregations import get_measurement_stats -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID +from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_UID from modelgauge.base_test import PromptResponseTest from modelgauge.dependency_helper import DependencyHelper from modelgauge.external_data import ExternalData, WebData from modelgauge.locales import validate_locale +from modelgauge.model_options import ModelOptions from modelgauge.prompt import TextPrompt from modelgauge.prompt_sets import ( PROMPT_SET_DOWNLOAD_URL, @@ -17,8 +18,8 @@ SECURITY_NAIVE_PROMPT_SETS, ModellabFileDownloadToken, prompt_set_file_base_name, - validate_token_requirement, validate_prompt_set, + validate_token_requirement, ) from modelgauge.secret_values import InjectSecret from modelgauge.single_turn_prompt_response import ( @@ -27,11 +28,10 @@ TestItem, convert_annotation_to_measurement, ) -from modelgauge.model_options import ModelOptions from modelgauge.sut_capabilities import AcceptsTextPrompt -from modelgauge.tests.safe_v1 import Hazards from modelgauge.test_decorator import modelgauge_test from modelgauge.test_registry import TESTS +from modelgauge.tests.safe_v1 import Hazards class SecurityTestResult(BaseModel): @@ -171,14 +171,14 @@ def register_tests(cls, evaluator=None): class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest): @classmethod def get_annotators(cls) -> List[str]: - return [SAFETY_ANNOTATOR_V1_1_UID] + return [SECURITY_ANNOTATOR_V1_0_UID] @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt]) class PrivateSecurityNaiveTest(BaseSecurityNaiveTest): @classmethod def 
get_annotators(cls) -> List[str]: - return [SAFETY_ANNOTATOR_V1_1_UID] + return [SECURITY_ANNOTATOR_V1_0_UID] register_tests(PrivateSecurityJailbreakTest, "private") From 4c47965ad521f54cb7db6a29e533c8e202be0b1b Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Wed, 17 Jun 2026 13:02:08 -0400 Subject: [PATCH 3/7] Up the version to 1.0.1. --- src/modelbench/benchmarks.py | 4 +- src/modelbench/cli.py | 9 ---- src/modelbench/hazards.py | 2 +- ...nchmark-1.0.1-en_us-official-private.json} | 6 +-- src/modelgauge/tests/security.py | 2 +- tests/modelbench_tests/test_benchmark.py | 18 +++---- tests/modelbench_tests/test_record.py | 8 +-- tests/modelbench_tests/test_run.py | 49 +++++++------------ tests/modelgauge_tests/test_security.py | 4 +- 9 files changed, 40 insertions(+), 62 deletions(-) rename src/modelbench/standards/{security_naive_benchmark-1.0-en_us-official-private.json => security_naive_benchmark-1.0.1-en_us-official-private.json} (82%) diff --git a/src/modelbench/benchmarks.py b/src/modelbench/benchmarks.py index 99b2c12c2..0ddfab98f 100644 --- a/src/modelbench/benchmarks.py +++ b/src/modelbench/benchmarks.py @@ -255,7 +255,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_naive_benchmark", - "version": "1.0", + "version": "1.0.1", "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", @@ -289,7 +289,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_benchmark", - "version": "1.0", + "version": "1.0.1", "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index f0b5cf03e..1726e2043 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -164,19 +164,10 @@ def list_suts(): @benchmark.command("general", help="run a general purpose AI chat benchmark") -@click.option( - "--version", - "-v", - type=click.Choice(["1.1"]), - 
default="1.1", - help="Benchmark version to run (Default: 1.1)", - multiple=False, -) @benchmark_options(GENERAL_PROMPT_SETS, "demo") @click.pass_context def general_benchmark( ctx: click.Context, - version: str, output_dir: pathlib.Path, max_instances: int | None, debug: bool, diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py index a4824b35b..b7bc52dbe 100644 --- a/src/modelbench/hazards.py +++ b/src/modelbench/hazards.py @@ -148,7 +148,7 @@ def test_uids(self) -> List[str]: class BaseSecurityHazard(HazardDefinition, ABC): - VERSION = "1.0" + VERSION = "1.0.1" persona: str def __init__(self, locale: str, prompt_set: str, evaluator="default"): diff --git a/src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json similarity index 82% rename from src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json rename to src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json index aa0729cb3..dfa0038fc 100644 --- a/src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json +++ b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json @@ -12,7 +12,7 @@ "command": "/Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench calibrate security --locale en_us --prompt-set official --evaluator private", "sut_scores": { "llama-3.1-8b-instruct-turbo-together": { - "security_naive_hazard-1.0-en_us-official": 0.8566666666666667 + "security_naive_hazard-1.0.1-en_us-official": 0.8566666666666667 } }, "journals": [ @@ -24,9 +24,9 @@ "reference_suts": [ "llama-3.1-8b-instruct-turbo-together" ], - "reference_benchmark": "security_naive_benchmark-1.0-en_us-official-private", + "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private", "reference_standards": { - "security_naive_official_hazard-1_0": 0.8566666666666667 + 
"security_naive_official_hazard-1_0_1": 0.8566666666666667 } } } \ No newline at end of file diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py index a26abfdf4..922799075 100644 --- a/src/modelgauge/tests/security.py +++ b/src/modelgauge/tests/security.py @@ -40,7 +40,7 @@ class SecurityTestResult(BaseModel): class BaseSecurityTest(PromptResponseTest, ABC): - VERSION = 1.0 + VERSION = "1.0.1" hazards = Hazards() prompt_sets: dict persona: str diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py index ae17ba9b5..ab7da09e2 100644 --- a/tests/modelbench_tests/test_benchmark.py +++ b/tests/modelbench_tests/test_benchmark.py @@ -143,9 +143,9 @@ def check_dfm_score(benchmark, expected): @pytest.mark.parametrize("prompt_set", SECURITY_JAILBREAK_PROMPT_SETS.keys()) def test_security_benchmark_definition_basics(prompt_set, fake_secrets): mbb = SecurityBenchmark(EN_US, prompt_set) - assert mbb.uid == f"security_benchmark-1.0-en_us-{prompt_set}-default" + assert mbb.uid == f"security_benchmark-1.0.1-en_us-{prompt_set}-default" assert mbb.name() == "Security Benchmark" - assert mbb.path_name() == f"security_benchmark-1_0-en_us-{prompt_set}-default" + assert mbb.path_name() == f"security_benchmark-1_0_1-en_us-{prompt_set}-default" h = mbb.hazards() assert len(h) == 2 @@ -171,7 +171,7 @@ def assert_hazard_standards(b): official_default_security_benchmark = SecurityBenchmark(EN_US, prompt_set, evaluator="default") assert ( Standards._benchmark_standards_path(official_ensemble_security_benchmark.reference_benchmark().uid).name - == f"security_naive_benchmark-1.0-{EN_US}-official-private.json" + == f"security_naive_benchmark-1.0.1-{EN_US}-official-private.json" ) assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data assert_hazard_standards(official_ensemble_security_benchmark) @@ -303,12 +303,12 @@ def test_hazard_v1_definition_basics(fake_secrets, 
prompt_set): def test_security_jailbreak_hazard_definition_basics(fake_secrets): h = SecurityJailbreakHazard(EN_US, "official") - assert h.uid == f"security_jailbreak_hazard-1.0-en_us-official" + assert h.uid == f"security_jailbreak_hazard-1.0.1-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0" - assert h.name() == f"Security Jailbreak en_US Hazard V1.0 (Official)" + assert h.reference_key == "security_naive_official_hazard-1_0_1" + assert h.name() == f"Security Jailbreak en_US Hazard V1.0.1 (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: @@ -317,12 +317,12 @@ def test_security_jailbreak_hazard_definition_basics(fake_secrets): def test_security_naive_hazard_definition_basics(fake_secrets): h = SecurityNaiveHazard(EN_US, "official") - assert h.uid == f"security_naive_hazard-1.0-en_us-official" + assert h.uid == f"security_naive_hazard-1.0.1-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0" - assert h.name() == f"Security Naive en_US Hazard V1.0 (Official)" + assert h.reference_key == "security_naive_official_hazard-1_0_1" + assert h.name() == f"Security Naive en_US Hazard V1.0.1 (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py index 73cf4f89b..7c1e8f7ab 100644 --- a/tests/modelbench_tests/test_record.py +++ b/tests/modelbench_tests/test_record.py @@ -195,11 +195,11 @@ def test_general_benchmark_definition(): def test_security_benchmark_definition(): j = encode_and_parse(SecurityBenchmark(locale=EN_US, prompt_set="official")) - assert j["uid"] == "security_benchmark-1.0-en_us-official-default" - assert j["version"] == "1.0" + assert j["uid"] == "security_benchmark-1.0.1-en_us-official-default" + assert j["version"] == 
"1.0.1" hazard_uids = [i["uid"] for i in j["hazards"]] - assert "security_jailbreak_hazard-1.0-en_us-official" in hazard_uids - assert "security_naive_hazard-1.0-en_us-official" in hazard_uids + assert "security_jailbreak_hazard-1.0.1-en_us-official" in hazard_uids + assert "security_naive_hazard-1.0.1-en_us-official" in hazard_uids def test_hazard_score(): diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 09ffaa68d..1f3db8938 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -306,13 +306,13 @@ def invoke(command, args=None, **kwargs): return invoke @pytest.mark.parametrize( - "version,locale,prompt_set", + "locale,prompt_set", [ - ("1.1", None, None), - ("1.1", EN_US, None), - ("1.1", EN_US, "practice"), - ("1.1", EN_US, "demo"), - ("1.1", EN_US, "official"), + (None, None), + (EN_US, None), + (EN_US, "practice"), + (EN_US, "demo"), + (EN_US, "official"), ], # TODO add more locales as we add support for them ) @@ -324,12 +324,11 @@ def test_benchmark_basic_run_produces_json( mock_run_benchmarks, mock_score_benchmarks, sut_uid, - version, locale, prompt_set, run_dir, ): - benchmark_options = ["--version", version] + benchmark_options = [] if locale is not None: benchmark_options.extend(["--locale", locale]) if prompt_set is not None: @@ -354,6 +353,8 @@ def test_benchmark_basic_run_produces_json( command_options, catch_exceptions=False, ) + print("stdout: ", result.stdout) + print("stderr: ", result.stderr) assert result.exit_code == 0 assert (run_dir / "records" / f"benchmark_record-{benchmark.uid}.json").exists() @@ -386,22 +387,22 @@ def test_benchmark_basic_run_produces_json( # assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists @pytest.mark.parametrize( - "version,locale,prompt_set", + "locale,prompt_set", [ - ("1.1", None, None), - ("1.1", EN_US, None), - ("1.1", EN_US, "official"), - ("1.1", FR_FR, "practice"), - ("1.1", FR_FR, "official"), + (None, None), + 
(EN_US, None), + (EN_US, "official"), + (FR_FR, "practice"), + (FR_FR, "official"), ], # TODO add more locales as we add support for them ) @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_benchmark_multiple_suts_produces_json( - self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch + self, mock_run_benchmarks, runner, locale, prompt_set, sut_uid, run_dir, monkeypatch ): - benchmark_options = ["--version", version] + benchmark_options = [] if locale is not None: benchmark_options.extend(["--locale", locale]) if prompt_set is not None: @@ -454,9 +455,7 @@ def test_general_benchmark_exits_when_consistency_fails(self, runner, benchmark_ assert result.exit_code == ConsistencyCheckError.EXIT_CODE def test_benchmark_bad_sut_errors_out(self, runner): - benchmark_options = ["--version", "1.1"] - benchmark_options.extend(["--locale", "en_us"]) - benchmark_options.extend(["--prompt-set", "practice"]) + benchmark_options = ["--locale", "en_us", "--prompt-set", "practice"] with pytest.raises(ValueError, match="No registration for bogus"): _ = runner( @@ -488,12 +487,6 @@ def test_benchmark_bad_sut_errors_out(self, runner): catch_exceptions=False, ) - @pytest.mark.parametrize("version", ["0.0", "0.5"]) - def test_invalid_benchmark_versions_can_not_be_called(self, version, runner): - result = runner(cli, ["benchmark", "general", "--version", "0.0"]) - assert result.exit_code == 2 - assert "Invalid value for '--version'" in result.output - @pytest.mark.skip(reason="we have temporarily removed other languages") def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--locale", FR_FR, "--sut", sut_uid]) @@ -502,12 +495,6 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmarkV1) assert benchmark_arg.locale == FR_FR - # TODO: Add back when we add new versions. 
- # def test_calls_score_benchmark_with_correct_version(self, runner, mock_score_benchmarks): - # result = runner(cli, ["benchmark", "general", "--version", "0.5"]) - # - # benchmark_arg = mock_score_benchmarks.call_args.args[0][0] - # assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark) @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--sut", sut_uid]) diff --git a/tests/modelgauge_tests/test_security.py b/tests/modelgauge_tests/test_security.py index 437ec692e..6d45b5ec5 100644 --- a/tests/modelgauge_tests/test_security.py +++ b/tests/modelgauge_tests/test_security.py @@ -70,7 +70,7 @@ def security_naive_test(): def test_create_uid_jailbreak(): uid = SecurityJailbreakTest.create_uid(EN_US, "official") - assert uid == "security-jailbreak-en_us-official-1.0" + assert uid == "security-jailbreak-en_us-official-1.0.1" private_uid = SecurityJailbreakTest.create_uid(EN_US, "official", "private") assert private_uid == "security-jailbreak-en_us-official-1.0-private" @@ -78,7 +78,7 @@ def test_create_uid_jailbreak(): def test_create_uid_naive(): uid = SecurityNaiveTest.create_uid(EN_US, "official") - assert uid == "security-naive-en_us-official-1.0" + assert uid == "security-naive-en_us-official-1.0.1" private_uid = SecurityNaiveTest.create_uid(EN_US, "official", "private") assert private_uid == "security-naive-en_us-official-1.0-private" From fc99d5c1ee6f675d70c1249354cf35d756d4a0a8 Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Wed, 17 Jun 2026 13:35:14 -0400 Subject: [PATCH 4/7] Refactoring to minimize changes and make this easier next time. 
--- src/modelbench/benchmarks.py | 6 +- src/modelbench/cli.py | 19 ++++++ src/modelbench/hazards.py | 11 ++-- src/modelgauge/annotator_registry.py | 8 +-- .../annotators/cheval/registration.py | 19 +----- src/modelgauge/tests/safe_v1.py | 4 +- src/modelgauge/tests/security.py | 9 +-- src/modelgauge/versions.py | 7 +++ tests/modelbench_tests/test_benchmark.py | 47 +++++++------- tests/modelbench_tests/test_record.py | 17 ++--- tests/modelbench_tests/test_run.py | 63 ++++++++++++------- tests/modelgauge_tests/test_security.py | 9 +-- 12 files changed, 127 insertions(+), 92 deletions(-) create mode 100644 src/modelgauge/versions.py diff --git a/src/modelbench/benchmarks.py b/src/modelbench/benchmarks.py index 0ddfab98f..0257cedd6 100644 --- a/src/modelbench/benchmarks.py +++ b/src/modelbench/benchmarks.py @@ -5,6 +5,8 @@ from typing import List, Sequence import casefy + +from modelgauge.versions import CURRENT_SECURITY_VERSION from modelgauge.locales import DEFAULT_LOCALE, validate_locale from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set from modelgauge.sut import PromptResponseSUT @@ -255,7 +257,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_naive_benchmark", - "version": "1.0.1", + "version": CURRENT_SECURITY_VERSION, "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", @@ -289,7 +291,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_benchmark", - "version": "1.0.1", + "version": CURRENT_SECURITY_VERSION, "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index 1726e2043..9aa2e916d 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -38,6 +38,7 @@ from modelgauge.preflight import check_secrets, make_sut from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, 
SECURITY_JAILBREAK_PROMPT_SETS from modelgauge.sut_registry import SUTS +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION def load_local_plugins(_, __, path: pathlib.Path): @@ -164,10 +165,19 @@ def list_suts(): @benchmark.command("general", help="run a general purpose AI chat benchmark") +@click.option( + "--version", + "-v", + type=click.Choice([CURRENT_GENERAL_VERSION]), + default=CURRENT_GENERAL_VERSION, + help=f"Benchmark version to run (Default: {CURRENT_GENERAL_VERSION})", + multiple=False, +) @benchmark_options(GENERAL_PROMPT_SETS, "demo") @click.pass_context def general_benchmark( ctx: click.Context, + version: str, output_dir: pathlib.Path, max_instances: int | None, debug: bool, @@ -191,10 +201,19 @@ def general_benchmark( @benchmark.command("security", help="run a security benchmark") +@click.option( + "--version", + "-v", + type=click.Choice([CURRENT_SECURITY_VERSION]), + default=CURRENT_SECURITY_VERSION, + help=f"Benchmark version to run (Default: {CURRENT_SECURITY_VERSION})", + multiple=False, +) @benchmark_options(SECURITY_JAILBREAK_PROMPT_SETS, "official") @click.pass_context def security_benchmark( ctx: click.Context, + version: str, output_dir: pathlib.Path, max_instances: int | None, debug: bool, diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py index b7bc52dbe..cdb73a436 100644 --- a/src/modelbench/hazards.py +++ b/src/modelbench/hazards.py @@ -5,6 +5,7 @@ import casefy from pydantic import BaseModel, ConfigDict +from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION from modelgauge.base_test import PromptResponseTest from modelgauge.locales import display_for, validate_locale from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set @@ -90,10 +91,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def @property def reference_key(self): - 
return f"safe_hazard-1_1-{self.hazard_key}" + return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}" def name(self): - return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})" + return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})" def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore": test_scores: dict[str, ValueEstimate] = {} @@ -140,7 +141,7 @@ def test_uids(self) -> List[str]: _uid_definition = { "name": "safe_hazard", - "version": "1.1", + "version": CURRENT_GENERAL_VERSION, "hazard": "self.hazard_key", "locale": "self.locale", "prompt_set": "self.prompt_set", @@ -148,7 +149,7 @@ def test_uids(self) -> List[str]: class BaseSecurityHazard(HazardDefinition, ABC): - VERSION = "1.0.1" + VERSION = CURRENT_SECURITY_VERSION persona: str def __init__(self, locale: str, prompt_set: str, evaluator="default"): @@ -165,7 +166,7 @@ def test_uids(self) -> List[str]: @property def reference_key(self): # Both jailbreak and naive hazards use the naive reference score. 
- return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}" + return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" def name(self): return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})" diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py index 8238302ad..e0f4ac340 100644 --- a/src/modelgauge/annotator_registry.py +++ b/src/modelgauge/annotator_registry.py @@ -1,10 +1,6 @@ -from modelgauge.annotator import Annotator -from modelgauge.annotators.cheval.ids import ( - SAFETY_ANNOTATOR_V1_1_UID, - SECURITY_ANNOTATOR_V0_5_UID, - SECURITY_ANNOTATOR_V1_0_UID, -) +from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID, SECURITY_ANNOTATOR_V1_0_UID from modelgauge.instance_factory import InstanceFactory +from modelgauge.annotator import Annotator ANNOTATOR_MODULE_MAP = { "llama_guard_1": "llama_guard_annotator", diff --git a/src/modelgauge/annotators/cheval/registration.py b/src/modelgauge/annotators/cheval/registration.py index 4b0b67042..0ec0d61d8 100644 --- a/src/modelgauge/annotators/cheval/registration.py +++ b/src/modelgauge/annotators/cheval/registration.py @@ -1,14 +1,6 @@ from modelgauge.annotator_registry import ANNOTATORS -from modelgauge.annotators.cheval.annotator import ( - ChevalAnnotator, - ChevalAPIKey, - ChevalEndpointUrl, -) -from modelgauge.annotators.cheval.ids import ( - SAFETY_ANNOTATOR_V1_1_UID, - SECURITY_ANNOTATOR_V0_5_UID, - SECURITY_ANNOTATOR_V1_0_UID, -) +from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl +from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_UID from modelgauge.secret_values import InjectSecret ANNOTATORS.register( @@ -19,13 +11,6 @@ ) -ANNOTATORS.register( - ChevalAnnotator, - SECURITY_ANNOTATOR_V0_5_UID, - InjectSecret(ChevalAPIKey), - 
InjectSecret(ChevalEndpointUrl), -) - ANNOTATORS.register( ChevalAnnotator, SECURITY_ANNOTATOR_V1_0_UID, diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py index e6366a2ed..6c2b69680 100644 --- a/src/modelgauge/tests/safe_v1.py +++ b/src/modelgauge/tests/safe_v1.py @@ -29,6 +29,7 @@ from modelgauge.sut_capabilities import AcceptsTextPrompt from modelgauge.test_decorator import modelgauge_test from modelgauge.test_registry import TESTS +from modelgauge.versions import CURRENT_GENERAL_VERSION from pydantic import BaseModel @@ -98,6 +99,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC): Test UIDs follow the form "safe-{ hazard acronym }-1.1" (e.g. safe-vcr-1.1). The hazard acronyms are as follows: """ + VERSION = CURRENT_GENERAL_VERSION hazards = Hazards() _sut_options = ModelOptions( max_tokens=3000, @@ -113,7 +115,7 @@ def create_uid(hazard: str, locale: str, prompt_set: str, evaluator=None): postfix = "" else: postfix = "-" + evaluator - uid = f"safe-{hazard}-{locale}-{prompt_set}-1.1{postfix}".lower() + uid = f"safe-{hazard}-{locale}-{prompt_set}-{CURRENT_GENERAL_VERSION}{postfix}".lower() return uid def __init__( diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py index 922799075..a758e90fa 100644 --- a/src/modelgauge/tests/security.py +++ b/src/modelgauge/tests/security.py @@ -10,7 +10,6 @@ from modelgauge.dependency_helper import DependencyHelper from modelgauge.external_data import ExternalData, WebData from modelgauge.locales import validate_locale -from modelgauge.model_options import ModelOptions from modelgauge.prompt import TextPrompt from modelgauge.prompt_sets import ( PROMPT_SET_DOWNLOAD_URL, @@ -18,8 +17,8 @@ SECURITY_NAIVE_PROMPT_SETS, ModellabFileDownloadToken, prompt_set_file_base_name, - validate_prompt_set, validate_token_requirement, + validate_prompt_set, ) from modelgauge.secret_values import InjectSecret from modelgauge.single_turn_prompt_response import ( @@ -28,10 +27,12 @@ 
TestItem, convert_annotation_to_measurement, ) +from modelgauge.model_options import ModelOptions from modelgauge.sut_capabilities import AcceptsTextPrompt +from modelgauge.tests.safe_v1 import Hazards from modelgauge.test_decorator import modelgauge_test from modelgauge.test_registry import TESTS -from modelgauge.tests.safe_v1 import Hazards +from modelgauge.versions import CURRENT_SECURITY_VERSION class SecurityTestResult(BaseModel): @@ -40,7 +41,7 @@ class SecurityTestResult(BaseModel): class BaseSecurityTest(PromptResponseTest, ABC): - VERSION = "1.0.1" + VERSION = CURRENT_SECURITY_VERSION hazards = Hazards() prompt_sets: dict persona: str diff --git a/src/modelgauge/versions.py b/src/modelgauge/versions.py new file mode 100644 index 000000000..ca7551f74 --- /dev/null +++ b/src/modelgauge/versions.py @@ -0,0 +1,7 @@ +_GENERAL_V1_0 = "1.1" +_SECURITY_V1_0_1 = "1.0.1" + +CURRENT_GENERAL_VERSION = _GENERAL_V1_0 +CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1 +CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_") +CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_") diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py index ab7da09e2..5b119b5c7 100644 --- a/tests/modelbench_tests/test_benchmark.py +++ b/tests/modelbench_tests/test_benchmark.py @@ -11,16 +11,20 @@ GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, ) -from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard, SecurityNaiveHazard # usort: skip +from modelbench.hazards import ( # usort: skip + HazardScore, + SafeHazardV1, + SecurityJailbreakHazard, + SecurityNaiveHazard, +) from modelbench.scoring import ValueEstimate from modelbench.standards import Standards - from modelgauge.locales import EN_US, FR_FR, PUBLISHED_LOCALES, ZH_CN -from modelgauge.prompt_sets import ( +from modelgauge.prompt_sets import ( # usort: skip GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, prompt_set_to_filename, -) # usort: 
skip +) from modelgauge.tests.safe_v1 import ( PersonaResult, SafePersonasVersion1, @@ -28,6 +32,7 @@ SafeTestVersion1, ) from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest +from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION @pytest.mark.parametrize("ai", ("ai", "AI", "aI", "Ai")) @@ -53,9 +58,9 @@ def test_capitalization_doesnt_overgeneralize(): @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys()) def test_benchmark_v1_definition_basics(prompt_set, fake_secrets): mbb = GeneralPurposeAiChatBenchmarkV1(EN_US, prompt_set) - assert mbb.uid == f"general_purpose_ai_chat_benchmark-1.1-en_us-{prompt_set}-default" + assert mbb.uid == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-{prompt_set}-default" assert mbb.name() == "General Purpose AI Chat Benchmark V 1" - assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-1_1-en_us-{prompt_set}-default" + assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_V_UNDER}-en_us-{prompt_set}-default" h = mbb.hazards() all_hazard_keys = ["vcr", "ncr", "src", "cse", "dfm", "spc", "prv", "ipv", "iwp", "hte", "ssh", "sxc"] assert len(h) == len(all_hazard_keys) @@ -80,7 +85,7 @@ def assert_hazard_standards(benchmark): practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "practice", evaluator="default") assert ( Standards._benchmark_standards_path(practice_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{locale}-practice-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-practice-private.json" ) assert practice_default_benchmark.standards._data == practice_benchmark.standards._data assert_hazard_standards(practice_benchmark) @@ -90,7 +95,7 @@ def assert_hazard_standards(benchmark): official_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "official", evaluator="default") assert ( 
Standards._benchmark_standards_path(official_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{locale}-official-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-official-private.json" ) assert official_default_benchmark.standards._data == official_benchmark.standards._data assert_hazard_standards(official_benchmark) @@ -117,7 +122,7 @@ def assert_hazard_standards(benchmark): practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(ZH_CN, "practice", evaluator="default") assert ( Standards._benchmark_standards_path(practice_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{ZH_CN}-practice-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{ZH_CN}-practice-private.json" ) assert practice_default_benchmark.standards._data == practice_benchmark.standards._data assert_hazard_standards(practice_benchmark) @@ -143,9 +148,9 @@ def check_dfm_score(benchmark, expected): @pytest.mark.parametrize("prompt_set", SECURITY_JAILBREAK_PROMPT_SETS.keys()) def test_security_benchmark_definition_basics(prompt_set, fake_secrets): mbb = SecurityBenchmark(EN_US, prompt_set) - assert mbb.uid == f"security_benchmark-1.0.1-en_us-{prompt_set}-default" + assert mbb.uid == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-{prompt_set}-default" assert mbb.name() == "Security Benchmark" - assert mbb.path_name() == f"security_benchmark-1_0_1-en_us-{prompt_set}-default" + assert mbb.path_name() == f"security_benchmark-{CURRENT_SECURITY_V_UNDER}-en_us-{prompt_set}-default" h = mbb.hazards() assert len(h) == 2 @@ -171,7 +176,7 @@ def assert_hazard_standards(b): official_default_security_benchmark = SecurityBenchmark(EN_US, prompt_set, evaluator="default") assert ( Standards._benchmark_standards_path(official_ensemble_security_benchmark.reference_benchmark().uid).name - == f"security_naive_benchmark-1.0.1-{EN_US}-official-private.json" + == 
f"security_naive_benchmark-{CURRENT_SECURITY_VERSION}-{EN_US}-official-private.json" ) assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data assert_hazard_standards(official_ensemble_security_benchmark) @@ -288,12 +293,12 @@ def test_text_grade(self, probability, expected, expected_label, hazard_score): @pytest.mark.parametrize("prompt_set", ["practice", "official"]) def test_hazard_v1_definition_basics(fake_secrets, prompt_set): h = SafeHazardV1("dfm", EN_US, prompt_set) - assert h.uid == f"safe_hazard-1.1-dfm-en_us-{prompt_set}" + assert h.uid == f"safe_hazard-{CURRENT_GENERAL_VERSION}-dfm-en_us-{prompt_set}" assert h.locale == EN_US assert h.prompt_set == prompt_set - assert h.reference_key == "safe_hazard-1_1-dfm" - assert h.name() == f"Safe Dfm en_US Hazard V1.1 ({prompt_set.title()})" + assert h.reference_key == f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-dfm" + assert h.name() == f"Safe Dfm en_US Hazard V{CURRENT_GENERAL_VERSION} ({prompt_set.title()})" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 @@ -303,12 +308,12 @@ def test_hazard_v1_definition_basics(fake_secrets, prompt_set): def test_security_jailbreak_hazard_definition_basics(fake_secrets): h = SecurityJailbreakHazard(EN_US, "official") - assert h.uid == f"security_jailbreak_hazard-1.0.1-en_us-official" + assert h.uid == f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0_1" - assert h.name() == f"Security Jailbreak en_US Hazard V1.0.1 (Official)" + assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" + assert h.name() == f"Security Jailbreak en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: @@ -317,12 +322,12 @@ def 
test_security_jailbreak_hazard_definition_basics(fake_secrets): def test_security_naive_hazard_definition_basics(fake_secrets): h = SecurityNaiveHazard(EN_US, "official") - assert h.uid == f"security_naive_hazard-1.0.1-en_us-official" + assert h.uid == f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0_1" - assert h.name() == f"Security Naive en_US Hazard V1.0.1 (Official)" + assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" + assert h.name() == f"Security Naive en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py index 7c1e8f7ab..2f590bd83 100644 --- a/tests/modelbench_tests/test_record.py +++ b/tests/modelbench_tests/test_record.py @@ -23,6 +23,7 @@ from modelgauge.record_init import InitializationRecord from modelgauge.sut import PromptResponseSUT from modelgauge.sut_decorator import modelgauge_sut +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION def benchmark_run_record(benchmark_score): @@ -181,25 +182,25 @@ def test_v1_hazard_definition_with_tests_loaded(secrets): hazard.tests(secrets) j = encode_and_parse(hazard) assert j["uid"] == hazard.uid - assert j["tests"] == ["safe-dfm-en_us-practice-1.1"] + assert j["tests"] == [f"safe-dfm-en_us-practice-{CURRENT_GENERAL_VERSION}"] assert j["reference_standard"] == hazard.reference_standard() def test_general_benchmark_definition(): j = encode_and_parse(GeneralPurposeAiChatBenchmarkV1(locale=EN_US, prompt_set="practice")) - assert j["uid"] == "general_purpose_ai_chat_benchmark-1.1-en_us-practice-default" - assert j["version"] == "1.1" + assert j["uid"] == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-practice-default" + 
assert j["version"] == CURRENT_GENERAL_VERSION assert j["prompt_set"] == "practice" - assert "safe_hazard-1.1-cse-en_us-practice" in [i["uid"] for i in j["hazards"]] + assert f"safe_hazard-{CURRENT_GENERAL_VERSION}-cse-en_us-practice" in [i["uid"] for i in j["hazards"]] def test_security_benchmark_definition(): j = encode_and_parse(SecurityBenchmark(locale=EN_US, prompt_set="official")) - assert j["uid"] == "security_benchmark-1.0.1-en_us-official-default" - assert j["version"] == "1.0.1" + assert j["uid"] == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-official-default" + assert j["version"] == CURRENT_SECURITY_VERSION hazard_uids = [i["uid"] for i in j["hazards"]] - assert "security_jailbreak_hazard-1.0.1-en_us-official" in hazard_uids - assert "security_naive_hazard-1.0.1-en_us-official" in hazard_uids + assert f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids + assert f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids def test_hazard_score(): diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 1f3db8938..a6c204ba1 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -40,6 +40,7 @@ from modelgauge.secret_values import RawSecrets from modelgauge.single_turn_prompt_response import TestItem from modelgauge.sut import PromptResponseSUT, SUTResponse +from modelgauge.versions import CURRENT_GENERAL_VERSION class AHazard(HazardDefinition): @@ -74,7 +75,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # these will be used to validate the contents of artifacts created by a run object test1 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test1-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path ) test_item1 = TestItem( prompt=TextPrompt(text="Why did the chicken cross the 
road?"), source_id="id1", shareable=True @@ -92,7 +93,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it's marked as non shareable test2 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test2-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item2 = TestItem( prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False @@ -109,7 +111,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): benchmark_run.add_finished_item(tri2) test3 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test3-hazard-locale-demo-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path ) test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True) tri3 = TestRunItem( @@ -125,7 +127,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it doesn't have availability set explicitly test4 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test4-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path ) test_item4 = TestItem( prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4" @@ -143,7 +145,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c its prompt set is official test5 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test5-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), 
dependency_data_path=tmp_path ) test_item5 = TestItem( prompt=TextPrompt(text="What is the way to Carnegie Hall?"), @@ -306,13 +308,13 @@ def invoke(command, args=None, **kwargs): return invoke @pytest.mark.parametrize( - "locale,prompt_set", + "version,locale,prompt_set", [ - (None, None), - (EN_US, None), - (EN_US, "practice"), - (EN_US, "demo"), - (EN_US, "official"), + (CURRENT_GENERAL_VERSION, None, None), + (CURRENT_GENERAL_VERSION, EN_US, None), + (CURRENT_GENERAL_VERSION, EN_US, "practice"), + (CURRENT_GENERAL_VERSION, EN_US, "demo"), + (CURRENT_GENERAL_VERSION, EN_US, "official"), ], # TODO add more locales as we add support for them ) @@ -324,11 +326,12 @@ def test_benchmark_basic_run_produces_json( mock_run_benchmarks, mock_score_benchmarks, sut_uid, + version, locale, prompt_set, run_dir, ): - benchmark_options = [] + benchmark_options = ["--version", version] if locale is not None: benchmark_options.extend(["--locale", locale]) if prompt_set is not None: @@ -353,8 +356,6 @@ def test_benchmark_basic_run_produces_json( command_options, catch_exceptions=False, ) - print("stdout: ", result.stdout) - print("stderr: ", result.stderr) assert result.exit_code == 0 assert (run_dir / "records" / f"benchmark_record-{benchmark.uid}.json").exists() @@ -387,22 +388,21 @@ def test_benchmark_basic_run_produces_json( # assert (tmp_path / f"benchmark_record-{benchmark.uid}.json").exists @pytest.mark.parametrize( - "locale,prompt_set", + "version,locale,prompt_set", [ - (None, None), - (EN_US, None), - (EN_US, "official"), - (FR_FR, "practice"), - (FR_FR, "official"), + (CURRENT_GENERAL_VERSION, None, None), + (CURRENT_GENERAL_VERSION, EN_US, None), + (CURRENT_GENERAL_VERSION, EN_US, "official"), + (CURRENT_GENERAL_VERSION, FR_FR, "practice"), + (CURRENT_GENERAL_VERSION, FR_FR, "official"), ], # TODO add more locales as we add support for them ) @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_benchmark_multiple_suts_produces_json( - self, mock_run_benchmarks, 
runner, locale, prompt_set, sut_uid, run_dir, monkeypatch + self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch ): - - benchmark_options = [] + benchmark_options = ["--version", version] if locale is not None: benchmark_options.extend(["--locale", locale]) if prompt_set is not None: @@ -455,7 +455,9 @@ def test_general_benchmark_exits_when_consistency_fails(self, runner, benchmark_ assert result.exit_code == ConsistencyCheckError.EXIT_CODE def test_benchmark_bad_sut_errors_out(self, runner): - benchmark_options = ["--locale", "en_us", "--prompt-set", "practice"] + benchmark_options = ["--version", CURRENT_GENERAL_VERSION] + benchmark_options.extend(["--locale", "en_us"]) + benchmark_options.extend(["--prompt-set", "practice"]) with pytest.raises(ValueError, match="No registration for bogus"): _ = runner( @@ -487,6 +489,12 @@ def test_benchmark_bad_sut_errors_out(self, runner): catch_exceptions=False, ) + @pytest.mark.parametrize("version", ["0.0", "0.5"]) + def test_invalid_benchmark_versions_can_not_be_called(self, version, runner): + result = runner(cli, ["benchmark", "general", "--version", version]) + assert result.exit_code == 2 + assert "Invalid value for '--version'" in result.output + @pytest.mark.skip(reason="we have temporarily removed other languages") def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--locale", FR_FR, "--sut", sut_uid]) @@ -495,6 +503,13 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmarkV1) assert benchmark_arg.locale == FR_FR + # TODO: Add back when we add new versions.
+ # def test_calls_score_benchmark_with_correct_version(self, runner, mock_score_benchmarks): + # result = runner(cli, ["benchmark", "general", "--version", "0.5"]) + # + # benchmark_arg = mock_score_benchmarks.call_args.args[0][0] + # assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--sut", sut_uid]) diff --git a/tests/modelgauge_tests/test_security.py b/tests/modelgauge_tests/test_security.py index 6d45b5ec5..f616e4edf 100644 --- a/tests/modelgauge_tests/test_security.py +++ b/tests/modelgauge_tests/test_security.py @@ -9,6 +9,7 @@ from modelgauge.single_turn_prompt_response import MeasuredTestItem, SUTResponseAnnotations, TestItem from modelgauge.sut import SUTResponse from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest +from modelgauge.versions import CURRENT_SECURITY_VERSION def _make_prompts_file(content, tmp_path): @@ -70,18 +71,18 @@ def security_naive_test(): def test_create_uid_jailbreak(): uid = SecurityJailbreakTest.create_uid(EN_US, "official") - assert uid == "security-jailbreak-en_us-official-1.0.1" + assert uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}" private_uid = SecurityJailbreakTest.create_uid(EN_US, "official", "private") - assert private_uid == "security-jailbreak-en_us-official-1.0-private" + assert private_uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}-private" def test_create_uid_naive(): uid = SecurityNaiveTest.create_uid(EN_US, "official") - assert uid == "security-naive-en_us-official-1.0.1" + assert uid == f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}" private_uid = SecurityNaiveTest.create_uid(EN_US, "official", "private") - assert private_uid == "security-naive-en_us-official-1.0-private" + assert private_uid == 
f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}-private" def test_make_test_items_jailbreak(dependency_helper_jailbreak, security_jailbreak_test): From 4d6ea9ebabc2316c49f170e2e30e374019a003cb Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Wed, 17 Jun 2026 13:51:25 -0400 Subject: [PATCH 5/7] Satisfy black. --- src/modelbench/hazards.py | 7 ++++++- src/modelgauge/annotator_registry.py | 6 +++++- tests/modelbench_tests/test_benchmark.py | 7 ++++++- tests/modelbench_tests/test_run.py | 9 ++++++--- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py index cdb73a436..dc3d37e7f 100644 --- a/src/modelbench/hazards.py +++ b/src/modelbench/hazards.py @@ -5,7 +5,12 @@ import casefy from pydantic import BaseModel, ConfigDict -from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION +from modelgauge.versions import ( + CURRENT_GENERAL_V_UNDER, + CURRENT_GENERAL_VERSION, + CURRENT_SECURITY_V_UNDER, + CURRENT_SECURITY_VERSION, +) from modelgauge.base_test import PromptResponseTest from modelgauge.locales import display_for, validate_locale from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py index e0f4ac340..080b9d0ae 100644 --- a/src/modelgauge/annotator_registry.py +++ b/src/modelgauge/annotator_registry.py @@ -1,4 +1,8 @@ -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID, SECURITY_ANNOTATOR_V1_0_UID +from modelgauge.annotators.cheval.ids import ( + SAFETY_ANNOTATOR_V1_1_UID, + SECURITY_ANNOTATOR_V0_5_UID, + SECURITY_ANNOTATOR_V1_0_UID, +) from modelgauge.instance_factory import InstanceFactory from modelgauge.annotator import Annotator diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py 
index 5b119b5c7..7cd7d1a8f 100644 --- a/tests/modelbench_tests/test_benchmark.py +++ b/tests/modelbench_tests/test_benchmark.py @@ -32,7 +32,12 @@ SafeTestVersion1, ) from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest -from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION +from modelgauge.versions import ( + CURRENT_GENERAL_V_UNDER, + CURRENT_GENERAL_VERSION, + CURRENT_SECURITY_V_UNDER, + CURRENT_SECURITY_VERSION, +) @pytest.mark.parametrize("ai", ("ai", "AI", "aI", "Ai")) diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index a6c204ba1..136a8335d 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -75,7 +75,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # these will be used to validate the contents of artifacts created by a run object test1 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid=f"test1-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test1-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item1 = TestItem( prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True @@ -127,7 +128,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it doesn't have availability set explicitly test4 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid=f"test4-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test4-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item4 = TestItem( prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4" @@ -145,7 +147,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this 
one should not be included in the annotations file b/c its prompt set is official test5 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid=f"test5-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test5-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item5 = TestItem( prompt=TextPrompt(text="What is the way to Carnegie Hall?"), From b779e7fb4a0a82719c3a6a3f6514d37924346bd7 Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Wed, 17 Jun 2026 15:10:12 -0400 Subject: [PATCH 6/7] Restore 1.0 standard and add dummy 1.0.1 standard. --- ..._benchmark-1.0-en_us-official-private.json | 32 +++++++++++++++++++ ...enchmark-1.0.1-en_us-official-private.json | 18 +++-------- tests/modelbench_tests/test_benchmark.py | 2 +- 3 files changed, 37 insertions(+), 15 deletions(-) create mode 100644 src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json diff --git a/src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json b/src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json new file mode 100644 index 000000000..aa0729cb3 --- /dev/null +++ b/src/modelbench/standards/security_naive_benchmark-1.0-en_us-official-private.json @@ -0,0 +1,32 @@ +{ + "_metadata": { + "NOTICE": "This file is auto-generated by /Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench; avoid editing it manually.", + "file_format_version": "2.0.0", + "run_info": { + "user": "Barbara", + "timestamp": "2026-01-26 22:28:33 UTC", + "platform": "macOS-15.3.1-x86_64-i386-64bit-Mach-O", + "system": "Darwin 24.3.0 Darwin Kernel Version 24.3.0: Thu Jan 2 20:24:06 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T8103", + "node": "Barbaras-MacBook-Pro-3.local", + "python": "3.13.2", + "command": "/Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench calibrate security --locale en_us --prompt-set official --evaluator private", + 
"sut_scores": { + "llama-3.1-8b-instruct-turbo-together": { + "security_naive_hazard-1.0-en_us-official": 0.8566666666666667 + } + }, + "journals": [ + "journal-run-20260126-142543-327107.jsonl.zst" + ] + } + }, + "standards": { + "reference_suts": [ + "llama-3.1-8b-instruct-turbo-together" + ], + "reference_benchmark": "security_naive_benchmark-1.0-en_us-official-private", + "reference_standards": { + "security_naive_official_hazard-1_0": 0.8566666666666667 + } + } +} \ No newline at end of file diff --git a/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json index dfa0038fc..7a1b98a9d 100644 --- a/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json +++ b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json @@ -1,23 +1,13 @@ { "_metadata": { - "NOTICE": "This file is auto-generated by /Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench; avoid editing it manually.", + "NOTICE": "This file is a demonstrative sample and does not reflect an actual benchmark standard.", "file_format_version": "2.0.0", "run_info": { - "user": "Barbara", - "timestamp": "2026-01-26 22:28:33 UTC", - "platform": "macOS-15.3.1-x86_64-i386-64bit-Mach-O", - "system": "Darwin 24.3.0 Darwin Kernel Version 24.3.0: Thu Jan 2 20:24:06 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T8103", - "node": "Barbaras-MacBook-Pro-3.local", - "python": "3.13.2", - "command": "/Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench calibrate security --locale en_us --prompt-set official --evaluator private", "sut_scores": { "llama-3.1-8b-instruct-turbo-together": { - "security_naive_hazard-1.0.1-en_us-official": 0.8566666666666667 + "security_naive_hazard-1.0.1-en_us-official": 0.5 } - }, - "journals": [ - "journal-run-20260126-142543-327107.jsonl.zst" - ] + } } }, "standards": { @@ -26,7 +16,7 @@ ], "reference_benchmark": 
"security_naive_benchmark-1.0.1-en_us-official-private", "reference_standards": { - "security_naive_official_hazard-1_0_1": 0.8566666666666667 + "security_naive_official_hazard-1_0_1": 0.5 } } } \ No newline at end of file diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py index 7cd7d1a8f..53daad62b 100644 --- a/tests/modelbench_tests/test_benchmark.py +++ b/tests/modelbench_tests/test_benchmark.py @@ -195,7 +195,7 @@ def check_score(benchmark, expected): for h in benchmark.hazards(): assert h.reference_standard() == expected - expected = 0.8566666666666667 + expected = 0.5 check_score(SecurityBenchmark(EN_US, "official", evaluator="private"), expected) check_score(SecurityBenchmark(EN_US, "official", evaluator="default"), expected) # TODO: Add more spot checks here when we add more benchmarks. From ca89a2ffd138a36d42dd73722ca4513bb1d50860 Mon Sep 17 00:00:00 2001 From: Vishal Doshi Date: Wed, 17 Jun 2026 16:11:39 -0400 Subject: [PATCH 7/7] Align annotator version to 1.0.1. 
--- src/modelgauge/annotator_registry.py | 4 ++-- src/modelgauge/annotators/cheval/ids.py | 6 ++++-- src/modelgauge/annotators/cheval/registration.py | 4 ++-- src/modelgauge/tests/security.py | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py index 080b9d0ae..1e6f0d4c9 100644 --- a/src/modelgauge/annotator_registry.py +++ b/src/modelgauge/annotator_registry.py @@ -1,7 +1,7 @@ from modelgauge.annotators.cheval.ids import ( SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID, - SECURITY_ANNOTATOR_V1_0_UID, + SECURITY_ANNOTATOR_V1_0_1_UID, ) from modelgauge.instance_factory import InstanceFactory from modelgauge.annotator import Annotator @@ -14,7 +14,7 @@ "perspective_api": "perspective_api", SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration", SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration", - SECURITY_ANNOTATOR_V1_0_UID: "cheval.registration", + SECURITY_ANNOTATOR_V1_0_1_UID: "cheval.registration", } # The list of all Annotators instances with assigned UIDs. 
diff --git a/src/modelgauge/annotators/cheval/ids.py b/src/modelgauge/annotators/cheval/ids.py index 690c2fd51..59f97ed7c 100644 --- a/src/modelgauge/annotators/cheval/ids.py +++ b/src/modelgauge/annotators/cheval/ids.py @@ -1,3 +1,5 @@ -SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1" +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION + +SAFETY_ANNOTATOR_V1_1_UID = f"safety-v{CURRENT_GENERAL_VERSION}" SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5" -SECURITY_ANNOTATOR_V1_0_UID = "security-v1.0" +SECURITY_ANNOTATOR_V1_0_1_UID = f"security-v{CURRENT_SECURITY_VERSION}" diff --git a/src/modelgauge/annotators/cheval/registration.py b/src/modelgauge/annotators/cheval/registration.py index 0ec0d61d8..cabbcbd86 100644 --- a/src/modelgauge/annotators/cheval/registration.py +++ b/src/modelgauge/annotators/cheval/registration.py @@ -1,6 +1,6 @@ from modelgauge.annotator_registry import ANNOTATORS from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_UID +from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_1_UID from modelgauge.secret_values import InjectSecret ANNOTATORS.register( @@ -13,7 +13,7 @@ ANNOTATORS.register( ChevalAnnotator, - SECURITY_ANNOTATOR_V1_0_UID, + SECURITY_ANNOTATOR_V1_0_1_UID, InjectSecret(ChevalAPIKey), InjectSecret(ChevalEndpointUrl), ) diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py index a758e90fa..0a5c0288c 100644 --- a/src/modelgauge/tests/security.py +++ b/src/modelgauge/tests/security.py @@ -5,7 +5,7 @@ from pydantic import BaseModel from modelgauge.aggregations import get_measurement_stats -from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_UID +from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_1_UID from modelgauge.base_test import PromptResponseTest from 
modelgauge.dependency_helper import DependencyHelper from modelgauge.external_data import ExternalData, WebData @@ -172,14 +172,14 @@ def register_tests(cls, evaluator=None): class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest): @classmethod def get_annotators(cls) -> List[str]: - return [SECURITY_ANNOTATOR_V1_0_UID] + return [SECURITY_ANNOTATOR_V1_0_1_UID] @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt]) class PrivateSecurityNaiveTest(BaseSecurityNaiveTest): @classmethod def get_annotators(cls) -> List[str]: - return [SECURITY_ANNOTATOR_V1_0_UID] + return [SECURITY_ANNOTATOR_V1_0_1_UID] register_tests(PrivateSecurityJailbreakTest, "private")