mlcommons · superdosh · Jun 16, 2026 · Jun 16, 2026 · Jun 17, 2026 · Jun 17, 2026
@@ -5,6 +5,8 @@
 from typing import List, Sequence
 
 import casefy
+
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 from modelgauge.locales import DEFAULT_LOCALE, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
 from modelgauge.sut import PromptResponseSUT
@@ -255,7 +257,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_naive_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",
@@ -289,7 +291,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",

@@ -38,6 +38,7 @@
 from modelgauge.preflight import check_secrets, make_sut
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS
 from modelgauge.sut_registry import SUTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
 
 
 def load_local_plugins(_, __, path: pathlib.Path):
@@ -167,9 +168,9 @@ def list_suts():
 @click.option(
     "--version",
     "-v",
-    type=click.Choice(["1.1"]),
-    default="1.1",
-    help="Benchmark version to run (Default: 1.1)",
+    type=click.Choice([CURRENT_GENERAL_VERSION]),
+    default=CURRENT_GENERAL_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_GENERAL_VERSION})",
     multiple=False,
 )
 @benchmark_options(GENERAL_PROMPT_SETS, "demo")
@@ -200,10 +201,19 @@ def general_benchmark(
 
 
 @benchmark.command("security", help="run a security benchmark")
+@click.option(
+    "--version",
+    "-v",
+    type=click.Choice([CURRENT_SECURITY_VERSION]),
+    default=CURRENT_SECURITY_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_SECURITY_VERSION})",
+    multiple=False,
+)
 @benchmark_options(SECURITY_JAILBREAK_PROMPT_SETS, "official")
 @click.pass_context
 def security_benchmark(
     ctx: click.Context,
+    version: str,
     output_dir: pathlib.Path,
     max_instances: int | None,
     debug: bool,

@@ -5,6 +5,7 @@
 import casefy
 from pydantic import BaseModel, ConfigDict
 
+from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.locales import display_for, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
@@ -90,10 +91,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def
 
     @property
     def reference_key(self):
-        return f"safe_hazard-1_1-{self.hazard_key}"
+        return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}"
 
     def name(self):
-        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})"
+        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})"
 
     def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
         test_scores: dict[str, ValueEstimate] = {}
@@ -140,15 +141,15 @@ def test_uids(self) -> List[str]:
 
     _uid_definition = {
         "name": "safe_hazard",
-        "version": "1.1",
+        "version": CURRENT_GENERAL_VERSION,
         "hazard": "self.hazard_key",
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
     }
 
 
 class BaseSecurityHazard(HazardDefinition, ABC):
-    VERSION = "1.0"
+    VERSION = CURRENT_SECURITY_VERSION
     persona: str
 
     def __init__(self, locale: str, prompt_set: str, evaluator="default"):
@@ -165,7 +166,7 @@ def test_uids(self) -> List[str]:
     @property
     def reference_key(self):
         # Both jailbreak and naive hazards use the naive reference score.
-        return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}"
+        return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"
 
     def name(self):
         return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})"

@@ -12,7 +12,7 @@
             "command": "/Users/Barbara_1/MLCommons/modelbench/.venv/bin/modelbench calibrate security --locale en_us --prompt-set official --evaluator private",
             "sut_scores": {
                 "llama-3.1-8b-instruct-turbo-together": {
-                    "security_naive_hazard-1.0-en_us-official": 0.8566666666666667
+                    "security_naive_hazard-1.0.1-en_us-official": 0.8566666666666667
                 }
             },
             "journals": [
@@ -24,9 +24,9 @@
         "reference_suts": [
             "llama-3.1-8b-instruct-turbo-together"
         ],
-        "reference_benchmark": "security_naive_benchmark-1.0-en_us-official-private",
+        "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private",
         "reference_standards": {
-            "security_naive_official_hazard-1_0": 0.8566666666666667
+            "security_naive_official_hazard-1_0_1": 0.8566666666666667
         }
     }
 }
@@ -1,4 +1,4 @@
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID, SECURITY_ANNOTATOR_V1_0_UID
 from modelgauge.instance_factory import InstanceFactory
 from modelgauge.annotator import Annotator
 
@@ -10,6 +10,7 @@
     "perspective_api": "perspective_api",
     SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration",
     SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration",
+    SECURITY_ANNOTATOR_V1_0_UID: "cheval.registration",
 }
 
 # The list of all Annotators instances with assigned UIDs.

@@ -1,2 +1,3 @@
 SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1"
 SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5"
+SECURITY_ANNOTATOR_V1_0_UID = "security-v1.0"
@@ -1,6 +1,6 @@
 from modelgauge.annotator_registry import ANNOTATORS
 from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_UID
 from modelgauge.secret_values import InjectSecret
 
 ANNOTATORS.register(
@@ -13,7 +13,7 @@
 
 ANNOTATORS.register(
     ChevalAnnotator,
-    SECURITY_ANNOTATOR_V0_5_UID,
+    SECURITY_ANNOTATOR_V1_0_UID,
     InjectSecret(ChevalAPIKey),
     InjectSecret(ChevalEndpointUrl),
 )
@@ -29,6 +29,7 @@
 from modelgauge.sut_capabilities import AcceptsTextPrompt
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION
 from pydantic import BaseModel
 
 
@@ -98,6 +99,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
     Test UIDs follow the form "safe-{ hazard acronym }-1.1" (e.g. safe-vcr-1.1). The hazard acronyms are as follows:
     """
 
+    VERSION = CURRENT_GENERAL_VERSION
     hazards = Hazards()
     _sut_options = ModelOptions(
         max_tokens=3000,
@@ -113,7 +115,7 @@ def create_uid(hazard: str, locale: str, prompt_set: str, evaluator=None):
             postfix = ""
         else:
             postfix = "-" + evaluator
-        uid = f"safe-{hazard}-{locale}-{prompt_set}-1.1{postfix}".lower()
+        uid = f"safe-{hazard}-{locale}-{prompt_set}-{CURRENT_GENERAL_VERSION}{postfix}".lower()
         return uid
 
     def __init__(

@@ -5,7 +5,7 @@
 from pydantic import BaseModel
 
 from modelgauge.aggregations import get_measurement_stats
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID
+from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_UID
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.dependency_helper import DependencyHelper
 from modelgauge.external_data import ExternalData, WebData
@@ -32,6 +32,7 @@
 from modelgauge.tests.safe_v1 import Hazards
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 
 
 class SecurityTestResult(BaseModel):
@@ -40,7 +41,7 @@ class SecurityTestResult(BaseModel):
 
 
 class BaseSecurityTest(PromptResponseTest, ABC):
-    VERSION = 1.0
+    VERSION = CURRENT_SECURITY_VERSION
     hazards = Hazards()
     prompt_sets: dict
     persona: str
@@ -171,14 +172,14 @@ def register_tests(cls, evaluator=None):
 class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_UID]
 
 
 @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt])
 class PrivateSecurityNaiveTest(BaseSecurityNaiveTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_UID]
 
 
 register_tests(PrivateSecurityJailbreakTest, "private")

@@ -0,0 +1,7 @@
+_GENERAL_V1_1 = "1.1"  # general-purpose benchmark release string
+_SECURITY_V1_0_1 = "1.0.1"  # security benchmark release string
+
+CURRENT_GENERAL_VERSION = _GENERAL_V1_1  # dotted form, interpolated into general benchmark/hazard/test UIDs
+CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1  # dotted form, interpolated into security benchmark/hazard UIDs
+CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_")  # "1_1": underscore form for reference keys
+CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_")  # "1_0_1": underscore form for reference keys
@@ -11,23 +11,28 @@
     GeneralPurposeAiChatBenchmarkV1,
     SecurityBenchmark,
 )
-from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard, SecurityNaiveHazard  # usort: skip
+from modelbench.hazards import (  # usort: skip
+    HazardScore,
+    SafeHazardV1,
+    SecurityJailbreakHazard,
+    SecurityNaiveHazard,
+)
 from modelbench.scoring import ValueEstimate
 from modelbench.standards import Standards
-
 from modelgauge.locales import EN_US, FR_FR, PUBLISHED_LOCALES, ZH_CN
-from modelgauge.prompt_sets import (
+from modelgauge.prompt_sets import (  # usort: skip
     GENERAL_PROMPT_SETS,
     SECURITY_JAILBREAK_PROMPT_SETS,
     prompt_set_to_filename,
-)  # usort: skip
+)
 from modelgauge.tests.safe_v1 import (
     PersonaResult,
     SafePersonasVersion1,
     SafeTestResult,
     SafeTestVersion1,
 )
 from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest
+from modelgauge.versions import CURRENT_GENERAL_V_UNDER, CURRENT_GENERAL_VERSION, CURRENT_SECURITY_V_UNDER, CURRENT_SECURITY_VERSION
 
 
 @pytest.mark.parametrize("ai", ("ai", "AI", "aI", "Ai"))
@@ -53,9 +58,9 @@ def test_capitalization_doesnt_overgeneralize():
 @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys())
 def test_benchmark_v1_definition_basics(prompt_set, fake_secrets):
     mbb = GeneralPurposeAiChatBenchmarkV1(EN_US, prompt_set)
-    assert mbb.uid == f"general_purpose_ai_chat_benchmark-1.1-en_us-{prompt_set}-default"
+    assert mbb.uid == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-{prompt_set}-default"
     assert mbb.name() == "General Purpose AI Chat Benchmark V 1"
-    assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-1_1-en_us-{prompt_set}-default"
+    assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_V_UNDER}-en_us-{prompt_set}-default"
     h = mbb.hazards()
     all_hazard_keys = ["vcr", "ncr", "src", "cse", "dfm", "spc", "prv", "ipv", "iwp", "hte", "ssh", "sxc"]
     assert len(h) == len(all_hazard_keys)
@@ -80,7 +85,7 @@ def assert_hazard_standards(benchmark):
     practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "practice", evaluator="default")
     assert (
         Standards._benchmark_standards_path(practice_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{locale}-practice-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-practice-private.json"
     )
     assert practice_default_benchmark.standards._data == practice_benchmark.standards._data
     assert_hazard_standards(practice_benchmark)
@@ -90,7 +95,7 @@ def assert_hazard_standards(benchmark):
     official_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "official", evaluator="default")
     assert (
         Standards._benchmark_standards_path(official_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{locale}-official-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-official-private.json"
     )
     assert official_default_benchmark.standards._data == official_benchmark.standards._data
     assert_hazard_standards(official_benchmark)
@@ -117,7 +122,7 @@ def assert_hazard_standards(benchmark):
     practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(ZH_CN, "practice", evaluator="default")
     assert (
         Standards._benchmark_standards_path(practice_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{ZH_CN}-practice-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{ZH_CN}-practice-private.json"
     )
     assert practice_default_benchmark.standards._data == practice_benchmark.standards._data
     assert_hazard_standards(practice_benchmark)
@@ -143,9 +148,9 @@ def check_dfm_score(benchmark, expected):
 @pytest.mark.parametrize("prompt_set", SECURITY_JAILBREAK_PROMPT_SETS.keys())
 def test_security_benchmark_definition_basics(prompt_set, fake_secrets):
     mbb = SecurityBenchmark(EN_US, prompt_set)
-    assert mbb.uid == f"security_benchmark-1.0-en_us-{prompt_set}-default"
+    assert mbb.uid == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-{prompt_set}-default"
     assert mbb.name() == "Security Benchmark"
-    assert mbb.path_name() == f"security_benchmark-1_0-en_us-{prompt_set}-default"
+    assert mbb.path_name() == f"security_benchmark-{CURRENT_SECURITY_V_UNDER}-en_us-{prompt_set}-default"
     h = mbb.hazards()
 
     assert len(h) == 2
@@ -171,7 +176,7 @@ def assert_hazard_standards(b):
     official_default_security_benchmark = SecurityBenchmark(EN_US, prompt_set, evaluator="default")
     assert (
         Standards._benchmark_standards_path(official_ensemble_security_benchmark.reference_benchmark().uid).name
-        == f"security_naive_benchmark-1.0-{EN_US}-official-private.json"
+        == f"security_naive_benchmark-{CURRENT_SECURITY_VERSION}-{EN_US}-official-private.json"
     )
     assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data
     assert_hazard_standards(official_ensemble_security_benchmark)
@@ -288,12 +293,12 @@ def test_text_grade(self, probability, expected, expected_label, hazard_score):
 @pytest.mark.parametrize("prompt_set", ["practice", "official"])
 def test_hazard_v1_definition_basics(fake_secrets, prompt_set):
     h = SafeHazardV1("dfm", EN_US, prompt_set)
-    assert h.uid == f"safe_hazard-1.1-dfm-en_us-{prompt_set}"
+    assert h.uid == f"safe_hazard-{CURRENT_GENERAL_VERSION}-dfm-en_us-{prompt_set}"
     assert h.locale == EN_US
     assert h.prompt_set == prompt_set
 
-    assert h.reference_key == "safe_hazard-1_1-dfm"
-    assert h.name() == f"Safe Dfm en_US Hazard V1.1 ({prompt_set.title()})"
+    assert h.reference_key == f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-dfm"
+    assert h.name() == f"Safe Dfm en_US Hazard V{CURRENT_GENERAL_VERSION} ({prompt_set.title()})"
 
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
@@ -303,12 +308,12 @@ def test_hazard_v1_definition_basics(fake_secrets, prompt_set):
 
 def test_security_jailbreak_hazard_definition_basics(fake_secrets):
     h = SecurityJailbreakHazard(EN_US, "official")
-    assert h.uid == f"security_jailbreak_hazard-1.0-en_us-official"
+    assert h.uid == f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official"
     assert h.locale == EN_US
     assert h.prompt_set == "official"
 
-    assert h.reference_key == "security_naive_official_hazard-1_0"
-    assert h.name() == f"Security Jailbreak en_US Hazard V1.0 (Official)"
+    assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"
+    assert h.name() == f"Security Jailbreak en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)"
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
     for t in tests:
@@ -317,12 +322,12 @@ def test_security_jailbreak_hazard_definition_basics(fake_secrets):
 
 def test_security_naive_hazard_definition_basics(fake_secrets):
     h = SecurityNaiveHazard(EN_US, "official")
-    assert h.uid == f"security_naive_hazard-1.0-en_us-official"
+    assert h.uid == f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official"
     assert h.locale == EN_US
     assert h.prompt_set == "official"
 
-    assert h.reference_key == "security_naive_official_hazard-1_0"
-    assert h.name() == f"Security Naive en_US Hazard V1.0 (Official)"
+    assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"
+    assert h.name() == f"Security Naive en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)"
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
     for t in tests: