diff --git a/src/modelbench/benchmarks.py b/src/modelbench/benchmarks.py index 99b2c12c..0257cedd 100644 --- a/src/modelbench/benchmarks.py +++ b/src/modelbench/benchmarks.py @@ -5,6 +5,8 @@ from typing import List, Sequence import casefy + +from modelgauge.versions import CURRENT_SECURITY_VERSION from modelgauge.locales import DEFAULT_LOCALE, validate_locale from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set from modelgauge.sut import PromptResponseSUT @@ -255,7 +257,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_naive_benchmark", - "version": "1.0", + "version": CURRENT_SECURITY_VERSION, "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", @@ -289,7 +291,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]: _uid_definition = { "class": "security_benchmark", - "version": "1.0", + "version": CURRENT_SECURITY_VERSION, "locale": "self.locale", "prompt_set": "self.prompt_set", "evaluator": "self.evaluator", diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py index f0b5cf03..9aa2e916 100644 --- a/src/modelbench/cli.py +++ b/src/modelbench/cli.py @@ -38,6 +38,7 @@ from modelgauge.preflight import check_secrets, make_sut from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS from modelgauge.sut_registry import SUTS +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION def load_local_plugins(_, __, path: pathlib.Path): @@ -167,9 +168,9 @@ def list_suts(): @click.option( "--version", "-v", - type=click.Choice(["1.1"]), - default="1.1", - help="Benchmark version to run (Default: 1.1)", + type=click.Choice([CURRENT_GENERAL_VERSION]), + default=CURRENT_GENERAL_VERSION, + help=f"Benchmark version to run (Default: {CURRENT_GENERAL_VERSION})", multiple=False, ) @benchmark_options(GENERAL_PROMPT_SETS, "demo") @@ -200,10 +201,19 @@ def general_benchmark( @benchmark.command("security", help="run a security benchmark") +@click.option( + "--version", + "-v", + type=click.Choice([CURRENT_SECURITY_VERSION]), + default=CURRENT_SECURITY_VERSION, + help=f"Benchmark version to run (Default: {CURRENT_SECURITY_VERSION})", + multiple=False, +) @benchmark_options(SECURITY_JAILBREAK_PROMPT_SETS, "official") @click.pass_context def security_benchmark( ctx: click.Context, + version: str, output_dir: pathlib.Path, max_instances: int | None, debug: bool, diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py index a4824b35..dc3d37e7 100644 --- a/src/modelbench/hazards.py +++ b/src/modelbench/hazards.py @@ -5,6 +5,12 @@ import casefy from pydantic import BaseModel, ConfigDict +from modelgauge.versions import ( + CURRENT_GENERAL_V_UNDER, + CURRENT_GENERAL_VERSION, + CURRENT_SECURITY_V_UNDER, + CURRENT_SECURITY_VERSION, +) from modelgauge.base_test import PromptResponseTest from modelgauge.locales import display_for, validate_locale from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set @@ -90,10 +96,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def @property def reference_key(self): - return f"safe_hazard-1_1-{self.hazard_key}" + return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}" def name(self): - return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})" + return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})" def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore": test_scores: dict[str, ValueEstimate] = {} @@ -140,7 +146,7 @@ def test_uids(self) -> List[str]: _uid_definition = { "name": "safe_hazard", - "version": "1.1", + "version": CURRENT_GENERAL_VERSION, "hazard": "self.hazard_key", "locale": "self.locale", "prompt_set": "self.prompt_set", @@ -148,7 +154,7 @@ def test_uids(self) -> List[str]: class BaseSecurityHazard(HazardDefinition, ABC): - VERSION = "1.0" + VERSION = CURRENT_SECURITY_VERSION persona: str def __init__(self, locale: str, prompt_set: str, evaluator="default"): @@ -165,7 +171,7 @@ def test_uids(self) -> List[str]: @property def reference_key(self): # Both jailbreak and naive hazards use the naive reference score. - return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}" + return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" def name(self): return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})" diff --git a/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json new file mode 100644 index 00000000..7a1b98a9 --- /dev/null +++ b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json @@ -0,0 +1,22 @@ +{ + "_metadata": { + "NOTICE": "This file is a demonstrative sample and does not reflect an actual benchmark standard.", + "file_format_version": "2.0.0", + "run_info": { + "sut_scores": { + "llama-3.1-8b-instruct-turbo-together": { + "security_naive_hazard-1.0.1-en_us-official": 0.5 + } + } + } + }, + "standards": { + "reference_suts": [ + "llama-3.1-8b-instruct-turbo-together" + ], + "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private", + "reference_standards": { + "security_naive_official_hazard-1_0_1": 0.5 + } + } +} \ No newline at end of file diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py index 6cfad021..1e6f0d4c 100644 --- a/src/modelgauge/annotator_registry.py +++ b/src/modelgauge/annotator_registry.py @@ -1,4 +1,8 @@ -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID +from modelgauge.annotators.cheval.ids import ( + SAFETY_ANNOTATOR_V1_1_UID, + SECURITY_ANNOTATOR_V0_5_UID, + SECURITY_ANNOTATOR_V1_0_1_UID, +) from modelgauge.instance_factory import InstanceFactory from modelgauge.annotator import Annotator @@ -10,6 +14,7 @@ "perspective_api": "perspective_api", SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration", SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration", + SECURITY_ANNOTATOR_V1_0_1_UID: "cheval.registration", } # The list of all Annotators instances with assigned UIDs. diff --git a/src/modelgauge/annotators/cheval/ids.py b/src/modelgauge/annotators/cheval/ids.py index 6ad15492..59f97ed7 100644 --- a/src/modelgauge/annotators/cheval/ids.py +++ b/src/modelgauge/annotators/cheval/ids.py @@ -1,2 +1,5 @@ -SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1" +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION + +SAFETY_ANNOTATOR_V1_1_UID = f"safety-v{CURRENT_GENERAL_VERSION}" SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5" +SECURITY_ANNOTATOR_V1_0_1_UID = f"security-v{CURRENT_SECURITY_VERSION}" diff --git a/src/modelgauge/annotators/cheval/registration.py b/src/modelgauge/annotators/cheval/registration.py index 068282f4..cabbcbd8 100644 --- a/src/modelgauge/annotators/cheval/registration.py +++ b/src/modelgauge/annotators/cheval/registration.py @@ -1,6 +1,6 @@ from modelgauge.annotator_registry import ANNOTATORS from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID +from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_1_UID from modelgauge.secret_values import InjectSecret ANNOTATORS.register( @@ -13,7 +13,7 @@ ANNOTATORS.register( ChevalAnnotator, - SECURITY_ANNOTATOR_V0_5_UID, + SECURITY_ANNOTATOR_V1_0_1_UID, InjectSecret(ChevalAPIKey), InjectSecret(ChevalEndpointUrl), ) diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py index e6366a2e..6c2b6968 100644 --- a/src/modelgauge/tests/safe_v1.py +++ b/src/modelgauge/tests/safe_v1.py @@ -29,6 +29,7 @@ from modelgauge.sut_capabilities import AcceptsTextPrompt from modelgauge.test_decorator import modelgauge_test from modelgauge.test_registry import TESTS +from modelgauge.versions import CURRENT_GENERAL_VERSION from pydantic import BaseModel @@ -98,6 +99,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC): Test UIDs follow the form "safe-{ hazard acronym }-1.1" (e.g. safe-vcr-1.1). The hazard acronyms are as follows: """ + VERSION = CURRENT_GENERAL_VERSION hazards = Hazards() _sut_options = ModelOptions( max_tokens=3000, @@ -113,7 +115,7 @@ def create_uid(hazard: str, locale: str, prompt_set: str, evaluator=None): postfix = "" else: postfix = "-" + evaluator - uid = f"safe-{hazard}-{locale}-{prompt_set}-1.1{postfix}".lower() + uid = f"safe-{hazard}-{locale}-{prompt_set}-{CURRENT_GENERAL_VERSION}{postfix}".lower() return uid def __init__( diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py index 26d6219d..0a5c0288 100644 --- a/src/modelgauge/tests/security.py +++ b/src/modelgauge/tests/security.py @@ -5,7 +5,7 @@ from pydantic import BaseModel from modelgauge.aggregations import get_measurement_stats -from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID +from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_1_UID from modelgauge.base_test import PromptResponseTest from modelgauge.dependency_helper import DependencyHelper from modelgauge.external_data import ExternalData, WebData @@ -32,6 +32,7 @@ from modelgauge.tests.safe_v1 import Hazards from modelgauge.test_decorator import modelgauge_test from modelgauge.test_registry import TESTS +from modelgauge.versions import CURRENT_SECURITY_VERSION class SecurityTestResult(BaseModel): @@ -40,7 +41,7 @@ class SecurityTestResult(BaseModel): class BaseSecurityTest(PromptResponseTest, ABC): - VERSION = 1.0 + VERSION = CURRENT_SECURITY_VERSION hazards = Hazards() prompt_sets: dict persona: str @@ -171,14 +172,14 @@ def register_tests(cls, evaluator=None): class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest): @classmethod def get_annotators(cls) -> List[str]: - return [SAFETY_ANNOTATOR_V1_1_UID] + return [SECURITY_ANNOTATOR_V1_0_1_UID] @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt]) class PrivateSecurityNaiveTest(BaseSecurityNaiveTest): @classmethod def get_annotators(cls) -> List[str]: - return [SAFETY_ANNOTATOR_V1_1_UID] + return [SECURITY_ANNOTATOR_V1_0_1_UID] register_tests(PrivateSecurityJailbreakTest, "private") diff --git a/src/modelgauge/versions.py b/src/modelgauge/versions.py new file mode 100644 index 00000000..ca7551f7 --- /dev/null +++ b/src/modelgauge/versions.py @@ -0,0 +1,7 @@ +_GENERAL_V1_0 = "1.1" +_SECURITY_V1_0_1 = "1.0.1" + +CURRENT_GENERAL_VERSION = _GENERAL_V1_0 +CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1 +CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_") +CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_") diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py index ae17ba9b..53daad62 100644 --- a/tests/modelbench_tests/test_benchmark.py +++ b/tests/modelbench_tests/test_benchmark.py @@ -11,16 +11,20 @@ GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark, ) -from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard, SecurityNaiveHazard # usort: skip +from modelbench.hazards import ( # usort: skip + HazardScore, + SafeHazardV1, + SecurityJailbreakHazard, + SecurityNaiveHazard, +) from modelbench.scoring import ValueEstimate from modelbench.standards import Standards - from modelgauge.locales import EN_US, FR_FR, PUBLISHED_LOCALES, ZH_CN -from modelgauge.prompt_sets import ( +from modelgauge.prompt_sets import ( # usort: skip GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, prompt_set_to_filename, -) # usort: skip +) from modelgauge.tests.safe_v1 import ( PersonaResult, SafePersonasVersion1, @@ -28,6 +32,12 @@ SafeTestVersion1, ) from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest +from modelgauge.versions import ( + CURRENT_GENERAL_V_UNDER, + CURRENT_GENERAL_VERSION, + CURRENT_SECURITY_V_UNDER, + CURRENT_SECURITY_VERSION, +) @pytest.mark.parametrize("ai", ("ai", "AI", "aI", "Ai")) @@ -53,9 +63,9 @@ def test_capitalization_doesnt_overgeneralize(): @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys()) def test_benchmark_v1_definition_basics(prompt_set, fake_secrets): mbb = GeneralPurposeAiChatBenchmarkV1(EN_US, prompt_set) - assert mbb.uid == f"general_purpose_ai_chat_benchmark-1.1-en_us-{prompt_set}-default" + assert mbb.uid == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-{prompt_set}-default" assert mbb.name() == "General Purpose AI Chat Benchmark V 1" - assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-1_1-en_us-{prompt_set}-default" + assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_V_UNDER}-en_us-{prompt_set}-default" h = mbb.hazards() all_hazard_keys = ["vcr", "ncr", "src", "cse", "dfm", "spc", "prv", "ipv", "iwp", "hte", "ssh", "sxc"] assert len(h) == len(all_hazard_keys) @@ -80,7 +90,7 @@ def assert_hazard_standards(benchmark): practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "practice", evaluator="default") assert ( Standards._benchmark_standards_path(practice_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{locale}-practice-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-practice-private.json" ) assert practice_default_benchmark.standards._data == practice_benchmark.standards._data assert_hazard_standards(practice_benchmark) @@ -90,7 +100,7 @@ def assert_hazard_standards(benchmark): official_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "official", evaluator="default") assert ( Standards._benchmark_standards_path(official_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{locale}-official-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-official-private.json" ) assert official_default_benchmark.standards._data == official_benchmark.standards._data assert_hazard_standards(official_benchmark) @@ -117,7 +127,7 @@ def assert_hazard_standards(benchmark): practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(ZH_CN, "practice", evaluator="default") assert ( Standards._benchmark_standards_path(practice_benchmark.uid).name - == f"general_purpose_ai_chat_benchmark-1.1-{ZH_CN}-practice-private.json" + == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{ZH_CN}-practice-private.json" ) assert practice_default_benchmark.standards._data == practice_benchmark.standards._data assert_hazard_standards(practice_benchmark) @@ -143,9 +153,9 @@ def check_dfm_score(benchmark, expected): @pytest.mark.parametrize("prompt_set", SECURITY_JAILBREAK_PROMPT_SETS.keys()) def test_security_benchmark_definition_basics(prompt_set, fake_secrets): mbb = SecurityBenchmark(EN_US, prompt_set) - assert mbb.uid == f"security_benchmark-1.0-en_us-{prompt_set}-default" + assert mbb.uid == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-{prompt_set}-default" assert mbb.name() == "Security Benchmark" - assert mbb.path_name() == f"security_benchmark-1_0-en_us-{prompt_set}-default" + assert mbb.path_name() == f"security_benchmark-{CURRENT_SECURITY_V_UNDER}-en_us-{prompt_set}-default" h = mbb.hazards() assert len(h) == 2 @@ -171,7 +181,7 @@ def assert_hazard_standards(b): official_default_security_benchmark = SecurityBenchmark(EN_US, prompt_set, evaluator="default") assert ( Standards._benchmark_standards_path(official_ensemble_security_benchmark.reference_benchmark().uid).name - == f"security_naive_benchmark-1.0-{EN_US}-official-private.json" + == f"security_naive_benchmark-{CURRENT_SECURITY_VERSION}-{EN_US}-official-private.json" ) assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data assert_hazard_standards(official_ensemble_security_benchmark) @@ -185,7 +195,7 @@ def check_score(benchmark, expected): for h in benchmark.hazards(): assert h.reference_standard() == expected - expected = 0.8566666666666667 + expected = 0.5 check_score(SecurityBenchmark(EN_US, "official", evaluator="private"), expected) check_score(SecurityBenchmark(EN_US, "official", evaluator="default"), expected) # TODO: Add more spot checks here when we add more benchmarks. @@ -288,12 +298,12 @@ def test_text_grade(self, probability, expected, expected_label, hazard_score): @pytest.mark.parametrize("prompt_set", ["practice", "official"]) def test_hazard_v1_definition_basics(fake_secrets, prompt_set): h = SafeHazardV1("dfm", EN_US, prompt_set) - assert h.uid == f"safe_hazard-1.1-dfm-en_us-{prompt_set}" + assert h.uid == f"safe_hazard-{CURRENT_GENERAL_VERSION}-dfm-en_us-{prompt_set}" assert h.locale == EN_US assert h.prompt_set == prompt_set - assert h.reference_key == "safe_hazard-1_1-dfm" - assert h.name() == f"Safe Dfm en_US Hazard V1.1 ({prompt_set.title()})" + assert h.reference_key == f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-dfm" + assert h.name() == f"Safe Dfm en_US Hazard V{CURRENT_GENERAL_VERSION} ({prompt_set.title()})" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 @@ -303,12 +313,12 @@ def test_hazard_v1_definition_basics(fake_secrets, prompt_set): def test_security_jailbreak_hazard_definition_basics(fake_secrets): h = SecurityJailbreakHazard(EN_US, "official") - assert h.uid == f"security_jailbreak_hazard-1.0-en_us-official" + assert h.uid == f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0" - assert h.name() == f"Security Jailbreak en_US Hazard V1.0 (Official)" + assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" + assert h.name() == f"Security Jailbreak en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: @@ -317,12 +327,12 @@ def test_security_jailbreak_hazard_definition_basics(fake_secrets): def test_security_naive_hazard_definition_basics(fake_secrets): h = SecurityNaiveHazard(EN_US, "official") - assert h.uid == f"security_naive_hazard-1.0-en_us-official" + assert h.uid == f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" assert h.locale == EN_US assert h.prompt_set == "official" - assert h.reference_key == "security_naive_official_hazard-1_0" - assert h.name() == f"Security Naive en_US Hazard V1.0 (Official)" + assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" + assert h.name() == f"Security Naive en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)" tests = h.tests(secrets=fake_secrets) assert len(tests) == 1 for t in tests: diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py index 73cf4f89..2f590bd8 100644 --- a/tests/modelbench_tests/test_record.py +++ b/tests/modelbench_tests/test_record.py @@ -23,6 +23,7 @@ from modelgauge.record_init import InitializationRecord from modelgauge.sut import PromptResponseSUT from modelgauge.sut_decorator import modelgauge_sut +from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION def benchmark_run_record(benchmark_score): @@ -181,25 +182,25 @@ def test_v1_hazard_definition_with_tests_loaded(secrets): hazard.tests(secrets) j = encode_and_parse(hazard) assert j["uid"] == hazard.uid - assert j["tests"] == ["safe-dfm-en_us-practice-1.1"] + assert j["tests"] == [f"safe-dfm-en_us-practice-{CURRENT_GENERAL_VERSION}"] assert j["reference_standard"] == hazard.reference_standard() def test_general_benchmark_definition(): j = encode_and_parse(GeneralPurposeAiChatBenchmarkV1(locale=EN_US, prompt_set="practice")) - assert j["uid"] == "general_purpose_ai_chat_benchmark-1.1-en_us-practice-default" - assert j["version"] == "1.1" + assert j["uid"] == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-practice-default" + assert j["version"] == CURRENT_GENERAL_VERSION assert j["prompt_set"] == "practice" - assert "safe_hazard-1.1-cse-en_us-practice" in [i["uid"] for i in j["hazards"]] + assert f"safe_hazard-{CURRENT_GENERAL_VERSION}-cse-en_us-practice" in [i["uid"] for i in j["hazards"]] def test_security_benchmark_definition(): j = encode_and_parse(SecurityBenchmark(locale=EN_US, prompt_set="official")) - assert j["uid"] == "security_benchmark-1.0-en_us-official-default" - assert j["version"] == "1.0" + assert j["uid"] == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-official-default" + assert j["version"] == CURRENT_SECURITY_VERSION hazard_uids = [i["uid"] for i in j["hazards"]] - assert "security_jailbreak_hazard-1.0-en_us-official" in hazard_uids - assert "security_naive_hazard-1.0-en_us-official" in hazard_uids + assert f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids + assert f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids def test_hazard_score(): diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py index 09ffaa68..136a8335 100644 --- a/tests/modelbench_tests/test_run.py +++ b/tests/modelbench_tests/test_run.py @@ -40,6 +40,7 @@ from modelgauge.secret_values import RawSecrets from modelgauge.single_turn_prompt_response import TestItem from modelgauge.sut import PromptResponseSUT, SUTResponse +from modelgauge.versions import CURRENT_GENERAL_VERSION class AHazard(HazardDefinition): @@ -74,7 +75,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # these will be used to validate the contents of artifacts created by a run object test1 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test1-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item1 = TestItem( prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True @@ -92,7 +94,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it's marked as non shareable test2 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test2-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item2 = TestItem( prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False @@ -109,7 +112,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): benchmark_run.add_finished_item(tri2) test3 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test3-hazard-locale-demo-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path ) test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True) tri3 = TestRunItem( @@ -125,7 +128,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c it doesn't have availability set explicitly test4 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test4-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item4 = TestItem( prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4" @@ -143,7 +147,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path): # this one should not be included in the annotations file b/c its prompt set is official test5 = ModelgaugeTestWrapper( - actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path + actual_test=MagicMock(uid=f"test5-hazard-locale-official-{CURRENT_GENERAL_VERSION}"), + dependency_data_path=tmp_path, ) test_item5 = TestItem( prompt=TextPrompt(text="What is the way to Carnegie Hall?"), @@ -308,11 +313,11 @@ def invoke(command, args=None, **kwargs): @pytest.mark.parametrize( "version,locale,prompt_set", [ - ("1.1", None, None), - ("1.1", EN_US, None), - ("1.1", EN_US, "practice"), - ("1.1", EN_US, "demo"), - ("1.1", EN_US, "official"), + (CURRENT_GENERAL_VERSION, None, None), + (CURRENT_GENERAL_VERSION, EN_US, None), + (CURRENT_GENERAL_VERSION, EN_US, "practice"), + (CURRENT_GENERAL_VERSION, EN_US, "demo"), + (CURRENT_GENERAL_VERSION, EN_US, "official"), ], # TODO add more locales as we add support for them ) @@ -388,11 +393,11 @@ def test_benchmark_basic_run_produces_json( @pytest.mark.parametrize( "version,locale,prompt_set", [ - ("1.1", None, None), - ("1.1", EN_US, None), - ("1.1", EN_US, "official"), - ("1.1", FR_FR, "practice"), - ("1.1", FR_FR, "official"), + (CURRENT_GENERAL_VERSION, None, None), + (CURRENT_GENERAL_VERSION, EN_US, None), + (CURRENT_GENERAL_VERSION, EN_US, "official"), + (CURRENT_GENERAL_VERSION, FR_FR, "practice"), + (CURRENT_GENERAL_VERSION, FR_FR, "official"), ], # TODO add more locales as we add support for them ) @@ -400,7 +405,6 @@ def test_benchmark_basic_run_produces_json( def test_benchmark_multiple_suts_produces_json( self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch ): - benchmark_options = ["--version", version] if locale is not None: benchmark_options.extend(["--locale", locale]) @@ -454,7 +458,7 @@ def test_general_benchmark_exits_when_consistency_fails(self, runner, benchmark_ assert result.exit_code == ConsistencyCheckError.EXIT_CODE def test_benchmark_bad_sut_errors_out(self, runner): - benchmark_options = ["--version", "1.1"] + benchmark_options = ["--version", CURRENT_GENERAL_VERSION] benchmark_options.extend(["--locale", "en_us"]) benchmark_options.extend(["--prompt-set", "practice"]) @@ -508,6 +512,7 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben # # benchmark_arg = mock_score_benchmarks.call_args.args[0][0] # assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark) + @pytest.mark.parametrize("sut_uid", ["fake-sut"]) def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid): _ = runner(cli, ["benchmark", "general", "--sut", sut_uid]) diff --git a/tests/modelgauge_tests/test_security.py b/tests/modelgauge_tests/test_security.py index 437ec692..f616e4ed 100644 --- a/tests/modelgauge_tests/test_security.py +++ b/tests/modelgauge_tests/test_security.py @@ -9,6 +9,7 @@ from modelgauge.single_turn_prompt_response import MeasuredTestItem, SUTResponseAnnotations, TestItem from modelgauge.sut import SUTResponse from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest +from modelgauge.versions import CURRENT_SECURITY_VERSION def _make_prompts_file(content, tmp_path): @@ -70,18 +71,18 @@ def security_naive_test(): def test_create_uid_jailbreak(): uid = SecurityJailbreakTest.create_uid(EN_US, "official") - assert uid == "security-jailbreak-en_us-official-1.0" + assert uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}" private_uid = SecurityJailbreakTest.create_uid(EN_US, "official", "private") - assert private_uid == "security-jailbreak-en_us-official-1.0-private" + assert private_uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}-private" def test_create_uid_naive(): uid = SecurityNaiveTest.create_uid(EN_US, "official") - assert uid == "security-naive-en_us-official-1.0" + assert uid == f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}" private_uid = SecurityNaiveTest.create_uid(EN_US, "official", "private") - assert private_uid == "security-naive-en_us-official-1.0-private" + assert private_uid == f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}-private" def test_make_test_items_jailbreak(dependency_helper_jailbreak, security_jailbreak_test):