mlcommons · superdosh · Jun 18, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 17, 2026
@@ -5,6 +5,8 @@
 from typing import List, Sequence
 
 import casefy
+
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 from modelgauge.locales import DEFAULT_LOCALE, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
 from modelgauge.sut import PromptResponseSUT
@@ -255,7 +257,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_naive_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",
@@ -289,7 +291,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",

@@ -38,6 +38,7 @@
 from modelgauge.preflight import check_secrets, make_sut
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS
 from modelgauge.sut_registry import SUTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
 
 
 def load_local_plugins(_, __, path: pathlib.Path):
@@ -167,9 +168,9 @@ def list_suts():
 @click.option(
     "--version",
     "-v",
-    type=click.Choice(["1.1"]),
-    default="1.1",
-    help="Benchmark version to run (Default: 1.1)",
+    type=click.Choice([CURRENT_GENERAL_VERSION]),
+    default=CURRENT_GENERAL_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_GENERAL_VERSION})",
     multiple=False,
 )
 @benchmark_options(GENERAL_PROMPT_SETS, "demo")
@@ -200,10 +201,19 @@ def general_benchmark(
 
 
 @benchmark.command("security", help="run a security benchmark")
+@click.option(
+    "--version",
+    "-v",
+    type=click.Choice([CURRENT_SECURITY_VERSION]),
+    default=CURRENT_SECURITY_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_SECURITY_VERSION})",
+    multiple=False,
+)
 @benchmark_options(SECURITY_JAILBREAK_PROMPT_SETS, "official")
 @click.pass_context
 def security_benchmark(
     ctx: click.Context,
+    version: str,
     output_dir: pathlib.Path,
     max_instances: int | None,
     debug: bool,

@@ -5,6 +5,12 @@
 import casefy
 from pydantic import BaseModel, ConfigDict
 
+from modelgauge.versions import (
+    CURRENT_GENERAL_V_UNDER,
+    CURRENT_GENERAL_VERSION,
+    CURRENT_SECURITY_V_UNDER,
+    CURRENT_SECURITY_VERSION,
+)
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.locales import display_for, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
@@ -90,10 +96,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def
 
     @property
     def reference_key(self):
-        return f"safe_hazard-1_1-{self.hazard_key}"
+        return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}"
 
     def name(self):
-        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})"
+        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})"
 
     def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
         test_scores: dict[str, ValueEstimate] = {}
@@ -140,15 +146,15 @@ def test_uids(self) -> List[str]:
 
     _uid_definition = {
         "name": "safe_hazard",
-        "version": "1.1",
+        "version": CURRENT_GENERAL_VERSION,
         "hazard": "self.hazard_key",
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
     }
 
 
 class BaseSecurityHazard(HazardDefinition, ABC):
-    VERSION = "1.0"
+    VERSION = CURRENT_SECURITY_VERSION
     persona: str
 
     def __init__(self, locale: str, prompt_set: str, evaluator="default"):
@@ -165,7 +171,7 @@ def test_uids(self) -> List[str]:
     @property
     def reference_key(self):
         # Both jailbreak and naive hazards use the naive reference score.
-        return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}"
+        return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"  # NOTE(review): no longer derived from self.VERSION
 
     def name(self):
         return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})"

@@ -0,0 +1,22 @@
+{
+    "_metadata": {
+        "NOTICE": "This file is a demonstrative sample and does not reflect an actual benchmark standard.",
+        "file_format_version": "2.0.0",
+        "run_info": {
+            "sut_scores": {
+                "llama-3.1-8b-instruct-turbo-together": {
+                    "security_naive_hazard-1.0.1-en_us-official": 0.5
+                }
+            }
+        }
+    },
+    "standards": {
+        "reference_suts": [
+            "llama-3.1-8b-instruct-turbo-together"
+        ],
+        "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private",
+        "reference_standards": {
+            "security_naive_official_hazard-1_0_1": 0.5
+        }
+    }
+}
@@ -1,4 +1,8 @@
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import (
+    SAFETY_ANNOTATOR_V1_1_UID,
+    SECURITY_ANNOTATOR_V0_5_UID,
+    SECURITY_ANNOTATOR_V1_0_1_UID,
+)
 from modelgauge.instance_factory import InstanceFactory
 from modelgauge.annotator import Annotator
 
@@ -10,6 +14,7 @@
     "perspective_api": "perspective_api",
     SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration",
     SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration",
+    SECURITY_ANNOTATOR_V1_0_1_UID: "cheval.registration",
 }
 
 # The list of all Annotators instances with assigned UIDs.

@@ -1,2 +1,5 @@
-SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1"
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
+
+SAFETY_ANNOTATOR_V1_1_UID = f"safety-v{CURRENT_GENERAL_VERSION}"  # NOTE(review): name pins V1_1; value tracks current version
 SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5"
+SECURITY_ANNOTATOR_V1_0_1_UID = f"security-v{CURRENT_SECURITY_VERSION}"  # NOTE(review): name pins V1_0_1; value tracks current
@@ -1,6 +1,6 @@
 from modelgauge.annotator_registry import ANNOTATORS
 from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_1_UID
 from modelgauge.secret_values import InjectSecret
 
 ANNOTATORS.register(
@@ -13,7 +13,7 @@
 
 ANNOTATORS.register(
     ChevalAnnotator,
-    SECURITY_ANNOTATOR_V0_5_UID,
+    SECURITY_ANNOTATOR_V1_0_1_UID,
     InjectSecret(ChevalAPIKey),
     InjectSecret(ChevalEndpointUrl),
 )
@@ -29,6 +29,7 @@
 from modelgauge.sut_capabilities import AcceptsTextPrompt
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION
 from pydantic import BaseModel
 
 
@@ -98,6 +99,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
     Test UIDs follow the form "safe-{ hazard acronym }-1.1" (e.g. safe-vcr-1.1). The hazard acronyms are as follows:
     """
 
+    VERSION = CURRENT_GENERAL_VERSION
     hazards = Hazards()
     _sut_options = ModelOptions(
         max_tokens=3000,
@@ -113,7 +115,7 @@ def create_uid(hazard: str, locale: str, prompt_set: str, evaluator=None):
             postfix = ""
         else:
             postfix = "-" + evaluator
-        uid = f"safe-{hazard}-{locale}-{prompt_set}-1.1{postfix}".lower()
+        uid = f"safe-{hazard}-{locale}-{prompt_set}-{CURRENT_GENERAL_VERSION}{postfix}".lower()
         return uid
 
     def __init__(

@@ -5,7 +5,7 @@
 from pydantic import BaseModel
 
 from modelgauge.aggregations import get_measurement_stats
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID
+from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_1_UID
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.dependency_helper import DependencyHelper
 from modelgauge.external_data import ExternalData, WebData
@@ -32,6 +32,7 @@
 from modelgauge.tests.safe_v1 import Hazards
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 
 
 class SecurityTestResult(BaseModel):
@@ -40,7 +41,7 @@ class SecurityTestResult(BaseModel):
 
 
 class BaseSecurityTest(PromptResponseTest, ABC):
-    VERSION = 1.0
+    VERSION = CURRENT_SECURITY_VERSION
     hazards = Hazards()
     prompt_sets: dict
     persona: str
@@ -171,14 +172,14 @@ def register_tests(cls, evaluator=None):
 class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_1_UID]
 
 
 @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt])
 class PrivateSecurityNaiveTest(BaseSecurityNaiveTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_1_UID]
 
 
 register_tests(PrivateSecurityJailbreakTest, "private")

@@ -0,0 +1,7 @@
+_GENERAL_V1_1 = "1.1"  # general benchmark version string
+_SECURITY_V1_0_1 = "1.0.1"  # security benchmark version string
+
+CURRENT_GENERAL_VERSION = _GENERAL_V1_1
+CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1
+CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_")  # "1_1": underscore form used in reference keys
+CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_")  # "1_0_1": underscore form used in reference keys