diff --git a/src/modelbench/benchmarks.py b/src/modelbench/benchmarks.py
index 99b2c12c..0257cedd 100644
--- a/src/modelbench/benchmarks.py
+++ b/src/modelbench/benchmarks.py
@@ -5,6 +5,8 @@
 from typing import List, Sequence
 
 import casefy
+
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 from modelgauge.locales import DEFAULT_LOCALE, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
 from modelgauge.sut import PromptResponseSUT
@@ -255,7 +257,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_naive_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",
@@ -289,7 +291,7 @@ def _make_hazards(self) -> Sequence[HazardDefinition]:
 
     _uid_definition = {
         "class": "security_benchmark",
-        "version": "1.0",
+        "version": CURRENT_SECURITY_VERSION,
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
         "evaluator": "self.evaluator",
diff --git a/src/modelbench/cli.py b/src/modelbench/cli.py
index f0b5cf03..9aa2e916 100644
--- a/src/modelbench/cli.py
+++ b/src/modelbench/cli.py
@@ -38,6 +38,7 @@
 from modelgauge.preflight import check_secrets, make_sut
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS
 from modelgauge.sut_registry import SUTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
 
 
 def load_local_plugins(_, __, path: pathlib.Path):
@@ -167,9 +168,9 @@ def list_suts():
 @click.option(
     "--version",
     "-v",
-    type=click.Choice(["1.1"]),
-    default="1.1",
-    help="Benchmark version to run (Default: 1.1)",
+    type=click.Choice([CURRENT_GENERAL_VERSION]),
+    default=CURRENT_GENERAL_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_GENERAL_VERSION})",
     multiple=False,
 )
 @benchmark_options(GENERAL_PROMPT_SETS, "demo")
@@ -200,10 +201,19 @@ def general_benchmark(
 
 
 @benchmark.command("security", help="run a security benchmark")
+@click.option(
+    "--version",
+    "-v",
+    type=click.Choice([CURRENT_SECURITY_VERSION]),
+    default=CURRENT_SECURITY_VERSION,
+    help=f"Benchmark version to run (Default: {CURRENT_SECURITY_VERSION})",
+    multiple=False,
+)
 @benchmark_options(SECURITY_JAILBREAK_PROMPT_SETS, "official")
 @click.pass_context
 def security_benchmark(
     ctx: click.Context,
+    version: str,
     output_dir: pathlib.Path,
     max_instances: int | None,
     debug: bool,
diff --git a/src/modelbench/hazards.py b/src/modelbench/hazards.py
index a4824b35..dc3d37e7 100644
--- a/src/modelbench/hazards.py
+++ b/src/modelbench/hazards.py
@@ -5,6 +5,12 @@
 import casefy
 from pydantic import BaseModel, ConfigDict
 
+from modelgauge.versions import (
+    CURRENT_GENERAL_V_UNDER,
+    CURRENT_GENERAL_VERSION,
+    CURRENT_SECURITY_V_UNDER,
+    CURRENT_SECURITY_VERSION,
+)
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.locales import display_for, validate_locale
 from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set
@@ -90,10 +96,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def
 
     @property
     def reference_key(self):
-        return f"safe_hazard-1_1-{self.hazard_key}"
+        return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}"
 
     def name(self):
-        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})"
+        return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})"
 
     def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
         test_scores: dict[str, ValueEstimate] = {}
@@ -140,7 +146,7 @@ def test_uids(self) -> List[str]:
 
     _uid_definition = {
         "name": "safe_hazard",
-        "version": "1.1",
+        "version": CURRENT_GENERAL_VERSION,
         "hazard": "self.hazard_key",
         "locale": "self.locale",
         "prompt_set": "self.prompt_set",
@@ -148,7 +154,7 @@ def test_uids(self) -> List[str]:
 
 
 class BaseSecurityHazard(HazardDefinition, ABC):
-    VERSION = "1.0"
+    VERSION = CURRENT_SECURITY_VERSION
     persona: str
 
     def __init__(self, locale: str, prompt_set: str, evaluator="default"):
@@ -165,7 +171,7 @@ def test_uids(self) -> List[str]:
     @property
     def reference_key(self):
         # Both jailbreak and naive hazards use the naive reference score.
-        return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}"
+        return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"  # NOTE(review): no longer derived from self.VERSION (name() still is); a subclass overriding VERSION would get mismatched values - confirm
 
     def name(self):
         return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})"
diff --git a/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json
new file mode 100644
index 00000000..7a1b98a9
--- /dev/null
+++ b/src/modelbench/standards/security_naive_benchmark-1.0.1-en_us-official-private.json
@@ -0,0 +1,22 @@
+{
+    "_metadata": {
+        "NOTICE": "This file is a demonstrative sample and does not reflect an actual benchmark standard.",
+        "file_format_version": "2.0.0",
+        "run_info": {
+            "sut_scores": {
+                "llama-3.1-8b-instruct-turbo-together": {
+                    "security_naive_hazard-1.0.1-en_us-official": 0.5
+                }
+            }
+        }
+    },
+    "standards": {
+        "reference_suts": [
+            "llama-3.1-8b-instruct-turbo-together"
+        ],
+        "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private",
+        "reference_standards": {
+            "security_naive_official_hazard-1_0_1": 0.5
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/modelgauge/annotator_registry.py b/src/modelgauge/annotator_registry.py
index 6cfad021..1e6f0d4c 100644
--- a/src/modelgauge/annotator_registry.py
+++ b/src/modelgauge/annotator_registry.py
@@ -1,4 +1,8 @@
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import (
+    SAFETY_ANNOTATOR_V1_1_UID,
+    SECURITY_ANNOTATOR_V0_5_UID,
+    SECURITY_ANNOTATOR_V1_0_1_UID,
+)
 from modelgauge.instance_factory import InstanceFactory
 from modelgauge.annotator import Annotator
 
@@ -10,6 +14,7 @@
     "perspective_api": "perspective_api",
     SAFETY_ANNOTATOR_V1_1_UID: "cheval.registration",
     SECURITY_ANNOTATOR_V0_5_UID: "cheval.registration",
+    SECURITY_ANNOTATOR_V1_0_1_UID: "cheval.registration",
 }
 
 # The list of all Annotators instances with assigned UIDs.
diff --git a/src/modelgauge/annotators/cheval/ids.py b/src/modelgauge/annotators/cheval/ids.py
index 6ad15492..59f97ed7 100644
--- a/src/modelgauge/annotators/cheval/ids.py
+++ b/src/modelgauge/annotators/cheval/ids.py
@@ -1,2 +1,5 @@
-SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1"
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
+
+SAFETY_ANNOTATOR_V1_1_UID = f"safety-v{CURRENT_GENERAL_VERSION}"  # NOTE(review): name pins V1_1 but the value follows CURRENT_GENERAL_VERSION; they diverge on a version bump
 SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5"
+SECURITY_ANNOTATOR_V1_0_1_UID = f"security-v{CURRENT_SECURITY_VERSION}"  # NOTE(review): same coupling - name pins V1_0_1, value follows CURRENT_SECURITY_VERSION
diff --git a/src/modelgauge/annotators/cheval/registration.py b/src/modelgauge/annotators/cheval/registration.py
index 068282f4..cabbcbd8 100644
--- a/src/modelgauge/annotators/cheval/registration.py
+++ b/src/modelgauge/annotators/cheval/registration.py
@@ -1,6 +1,6 @@
 from modelgauge.annotator_registry import ANNOTATORS
 from modelgauge.annotators.cheval.annotator import ChevalAPIKey, ChevalAnnotator, ChevalEndpointUrl
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V0_5_UID
+from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID, SECURITY_ANNOTATOR_V1_0_1_UID
 from modelgauge.secret_values import InjectSecret
 
 ANNOTATORS.register(
@@ -13,7 +13,7 @@
 
 ANNOTATORS.register(
     ChevalAnnotator,
-    SECURITY_ANNOTATOR_V0_5_UID,
+    SECURITY_ANNOTATOR_V1_0_1_UID,  # NOTE(review): SECURITY_ANNOTATOR_V0_5_UID is no longer registered here, yet annotator_registry still maps it to this module - confirm that is intended
     InjectSecret(ChevalAPIKey),
     InjectSecret(ChevalEndpointUrl),
 )
diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py
index e6366a2e..6c2b6968 100644
--- a/src/modelgauge/tests/safe_v1.py
+++ b/src/modelgauge/tests/safe_v1.py
@@ -29,6 +29,7 @@
 from modelgauge.sut_capabilities import AcceptsTextPrompt
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_GENERAL_VERSION
 from pydantic import BaseModel
 
 
@@ -98,6 +99,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
     Test UIDs follow the form "safe-{ hazard acronym }-1.1" (e.g. safe-vcr-1.1). The hazard acronyms are as follows:
     """
 
+    VERSION = CURRENT_GENERAL_VERSION
     hazards = Hazards()
     _sut_options = ModelOptions(
         max_tokens=3000,
@@ -113,7 +115,7 @@ def create_uid(hazard: str, locale: str, prompt_set: str, evaluator=None):
             postfix = ""
         else:
             postfix = "-" + evaluator
-        uid = f"safe-{hazard}-{locale}-{prompt_set}-1.1{postfix}".lower()
+        uid = f"safe-{hazard}-{locale}-{prompt_set}-{CURRENT_GENERAL_VERSION}{postfix}".lower()
         return uid
 
     def __init__(
diff --git a/src/modelgauge/tests/security.py b/src/modelgauge/tests/security.py
index 26d6219d..0a5c0288 100644
--- a/src/modelgauge/tests/security.py
+++ b/src/modelgauge/tests/security.py
@@ -5,7 +5,7 @@
 from pydantic import BaseModel
 
 from modelgauge.aggregations import get_measurement_stats
-from modelgauge.annotators.cheval.ids import SAFETY_ANNOTATOR_V1_1_UID
+from modelgauge.annotators.cheval.ids import SECURITY_ANNOTATOR_V1_0_1_UID
 from modelgauge.base_test import PromptResponseTest
 from modelgauge.dependency_helper import DependencyHelper
 from modelgauge.external_data import ExternalData, WebData
@@ -32,6 +32,7 @@
 from modelgauge.tests.safe_v1 import Hazards
 from modelgauge.test_decorator import modelgauge_test
 from modelgauge.test_registry import TESTS
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 
 
 class SecurityTestResult(BaseModel):
@@ -40,7 +41,7 @@ class SecurityTestResult(BaseModel):
 
 
 class BaseSecurityTest(PromptResponseTest, ABC):
-    VERSION = 1.0
+    VERSION = CURRENT_SECURITY_VERSION
     hazards = Hazards()
     prompt_sets: dict
     persona: str
@@ -171,14 +172,14 @@ def register_tests(cls, evaluator=None):
 class PrivateSecurityJailbreakTest(BaseSecurityJailbreakTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_1_UID]
 
 
 @modelgauge_test(requires_sut_capabilities=[AcceptsTextPrompt])
 class PrivateSecurityNaiveTest(BaseSecurityNaiveTest):
     @classmethod
     def get_annotators(cls) -> List[str]:
-        return [SAFETY_ANNOTATOR_V1_1_UID]
+        return [SECURITY_ANNOTATOR_V1_0_1_UID]  # security annotator UID, replacing the safety annotator UID returned before
 
 
 register_tests(PrivateSecurityJailbreakTest, "private")
diff --git a/src/modelgauge/versions.py b/src/modelgauge/versions.py
new file mode 100644
index 00000000..ca7551f7
--- /dev/null
+++ b/src/modelgauge/versions.py
@@ -0,0 +1,7 @@
+_GENERAL_V1_1 = "1.1"  # general benchmark version string; the constant name mirrors its value
+_SECURITY_V1_0_1 = "1.0.1"  # security benchmark version string
+
+CURRENT_GENERAL_VERSION = _GENERAL_V1_1  # general version exported to the rest of the code
+CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1  # security version exported to the rest of the code
+CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_")  # "1.1" -> "1_1"
+CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_")  # "1.0.1" -> "1_0_1"
diff --git a/tests/modelbench_tests/test_benchmark.py b/tests/modelbench_tests/test_benchmark.py
index ae17ba9b..53daad62 100644
--- a/tests/modelbench_tests/test_benchmark.py
+++ b/tests/modelbench_tests/test_benchmark.py
@@ -11,16 +11,20 @@
     GeneralPurposeAiChatBenchmarkV1,
     SecurityBenchmark,
 )
-from modelbench.hazards import HazardScore, SafeHazardV1, SecurityJailbreakHazard, SecurityNaiveHazard  # usort: skip
+from modelbench.hazards import (  # usort: skip
+    HazardScore,
+    SafeHazardV1,
+    SecurityJailbreakHazard,
+    SecurityNaiveHazard,
+)
 from modelbench.scoring import ValueEstimate
 from modelbench.standards import Standards
-
 from modelgauge.locales import EN_US, FR_FR, PUBLISHED_LOCALES, ZH_CN
-from modelgauge.prompt_sets import (
+from modelgauge.prompt_sets import (  # usort: skip
     GENERAL_PROMPT_SETS,
     SECURITY_JAILBREAK_PROMPT_SETS,
     prompt_set_to_filename,
-)  # usort: skip
+)
 from modelgauge.tests.safe_v1 import (
     PersonaResult,
     SafePersonasVersion1,
@@ -28,6 +32,12 @@
     SafeTestVersion1,
 )
 from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest
+from modelgauge.versions import (
+    CURRENT_GENERAL_V_UNDER,
+    CURRENT_GENERAL_VERSION,
+    CURRENT_SECURITY_V_UNDER,
+    CURRENT_SECURITY_VERSION,
+)
 
 
 @pytest.mark.parametrize("ai", ("ai", "AI", "aI", "Ai"))
@@ -53,9 +63,9 @@ def test_capitalization_doesnt_overgeneralize():
 @pytest.mark.parametrize("prompt_set", GENERAL_PROMPT_SETS.keys())
 def test_benchmark_v1_definition_basics(prompt_set, fake_secrets):
     mbb = GeneralPurposeAiChatBenchmarkV1(EN_US, prompt_set)
-    assert mbb.uid == f"general_purpose_ai_chat_benchmark-1.1-en_us-{prompt_set}-default"
+    assert mbb.uid == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-{prompt_set}-default"
     assert mbb.name() == "General Purpose AI Chat Benchmark V 1"
-    assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-1_1-en_us-{prompt_set}-default"
+    assert mbb.path_name() == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_V_UNDER}-en_us-{prompt_set}-default"
     h = mbb.hazards()
     all_hazard_keys = ["vcr", "ncr", "src", "cse", "dfm", "spc", "prv", "ipv", "iwp", "hte", "ssh", "sxc"]
     assert len(h) == len(all_hazard_keys)
@@ -80,7 +90,7 @@ def assert_hazard_standards(benchmark):
     practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "practice", evaluator="default")
     assert (
         Standards._benchmark_standards_path(practice_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{locale}-practice-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-practice-private.json"
     )
     assert practice_default_benchmark.standards._data == practice_benchmark.standards._data
     assert_hazard_standards(practice_benchmark)
@@ -90,7 +100,7 @@ def assert_hazard_standards(benchmark):
     official_default_benchmark = GeneralPurposeAiChatBenchmarkV1(locale, "official", evaluator="default")
     assert (
         Standards._benchmark_standards_path(official_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{locale}-official-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{locale}-official-private.json"
     )
     assert official_default_benchmark.standards._data == official_benchmark.standards._data
     assert_hazard_standards(official_benchmark)
@@ -117,7 +127,7 @@ def assert_hazard_standards(benchmark):
     practice_default_benchmark = GeneralPurposeAiChatBenchmarkV1(ZH_CN, "practice", evaluator="default")
     assert (
         Standards._benchmark_standards_path(practice_benchmark.uid).name
-        == f"general_purpose_ai_chat_benchmark-1.1-{ZH_CN}-practice-private.json"
+        == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-{ZH_CN}-practice-private.json"
     )
     assert practice_default_benchmark.standards._data == practice_benchmark.standards._data
     assert_hazard_standards(practice_benchmark)
@@ -143,9 +153,9 @@ def check_dfm_score(benchmark, expected):
 @pytest.mark.parametrize("prompt_set", SECURITY_JAILBREAK_PROMPT_SETS.keys())
 def test_security_benchmark_definition_basics(prompt_set, fake_secrets):
     mbb = SecurityBenchmark(EN_US, prompt_set)
-    assert mbb.uid == f"security_benchmark-1.0-en_us-{prompt_set}-default"
+    assert mbb.uid == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-{prompt_set}-default"
     assert mbb.name() == "Security Benchmark"
-    assert mbb.path_name() == f"security_benchmark-1_0-en_us-{prompt_set}-default"
+    assert mbb.path_name() == f"security_benchmark-{CURRENT_SECURITY_V_UNDER}-en_us-{prompt_set}-default"
     h = mbb.hazards()
 
     assert len(h) == 2
@@ -171,7 +181,7 @@ def assert_hazard_standards(b):
     official_default_security_benchmark = SecurityBenchmark(EN_US, prompt_set, evaluator="default")
     assert (
         Standards._benchmark_standards_path(official_ensemble_security_benchmark.reference_benchmark().uid).name
-        == f"security_naive_benchmark-1.0-{EN_US}-official-private.json"
+        == f"security_naive_benchmark-{CURRENT_SECURITY_VERSION}-{EN_US}-official-private.json"
     )
     assert official_default_security_benchmark.standards._data == official_ensemble_security_benchmark.standards._data
     assert_hazard_standards(official_ensemble_security_benchmark)
@@ -185,7 +195,7 @@ def check_score(benchmark, expected):
         for h in benchmark.hazards():
             assert h.reference_standard() == expected
 
-    expected = 0.8566666666666667
+    expected = 0.5
     check_score(SecurityBenchmark(EN_US, "official", evaluator="private"), expected)
     check_score(SecurityBenchmark(EN_US, "official", evaluator="default"), expected)
     # TODO: Add more spot checks here when we add more benchmarks.
@@ -288,12 +298,12 @@ def test_text_grade(self, probability, expected, expected_label, hazard_score):
 @pytest.mark.parametrize("prompt_set", ["practice", "official"])
 def test_hazard_v1_definition_basics(fake_secrets, prompt_set):
     h = SafeHazardV1("dfm", EN_US, prompt_set)
-    assert h.uid == f"safe_hazard-1.1-dfm-en_us-{prompt_set}"
+    assert h.uid == f"safe_hazard-{CURRENT_GENERAL_VERSION}-dfm-en_us-{prompt_set}"
     assert h.locale == EN_US
     assert h.prompt_set == prompt_set
 
-    assert h.reference_key == "safe_hazard-1_1-dfm"
-    assert h.name() == f"Safe Dfm en_US Hazard V1.1 ({prompt_set.title()})"
+    assert h.reference_key == f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-dfm"
+    assert h.name() == f"Safe Dfm en_US Hazard V{CURRENT_GENERAL_VERSION} ({prompt_set.title()})"
 
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
@@ -303,12 +313,12 @@ def test_hazard_v1_definition_basics(fake_secrets, prompt_set):
 
 def test_security_jailbreak_hazard_definition_basics(fake_secrets):
     h = SecurityJailbreakHazard(EN_US, "official")
-    assert h.uid == f"security_jailbreak_hazard-1.0-en_us-official"
+    assert h.uid == f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official"
     assert h.locale == EN_US
     assert h.prompt_set == "official"
 
-    assert h.reference_key == "security_naive_official_hazard-1_0"
-    assert h.name() == f"Security Jailbreak en_US Hazard V1.0 (Official)"
+    assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"
+    assert h.name() == f"Security Jailbreak en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)"
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
     for t in tests:
@@ -317,12 +327,12 @@ def test_security_jailbreak_hazard_definition_basics(fake_secrets):
 
 def test_security_naive_hazard_definition_basics(fake_secrets):
     h = SecurityNaiveHazard(EN_US, "official")
-    assert h.uid == f"security_naive_hazard-1.0-en_us-official"
+    assert h.uid == f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official"
     assert h.locale == EN_US
     assert h.prompt_set == "official"
 
-    assert h.reference_key == "security_naive_official_hazard-1_0"
-    assert h.name() == f"Security Naive en_US Hazard V1.0 (Official)"
+    assert h.reference_key == f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}"
+    assert h.name() == f"Security Naive en_US Hazard V{CURRENT_SECURITY_VERSION} (Official)"
     tests = h.tests(secrets=fake_secrets)
     assert len(tests) == 1
     for t in tests:
diff --git a/tests/modelbench_tests/test_record.py b/tests/modelbench_tests/test_record.py
index 73cf4f89..2f590bd8 100644
--- a/tests/modelbench_tests/test_record.py
+++ b/tests/modelbench_tests/test_record.py
@@ -23,6 +23,7 @@
 from modelgauge.record_init import InitializationRecord
 from modelgauge.sut import PromptResponseSUT
 from modelgauge.sut_decorator import modelgauge_sut
+from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION
 
 
 def benchmark_run_record(benchmark_score):
@@ -181,25 +182,25 @@ def test_v1_hazard_definition_with_tests_loaded(secrets):
     hazard.tests(secrets)
     j = encode_and_parse(hazard)
     assert j["uid"] == hazard.uid
-    assert j["tests"] == ["safe-dfm-en_us-practice-1.1"]
+    assert j["tests"] == [f"safe-dfm-en_us-practice-{CURRENT_GENERAL_VERSION}"]
     assert j["reference_standard"] == hazard.reference_standard()
 
 
 def test_general_benchmark_definition():
     j = encode_and_parse(GeneralPurposeAiChatBenchmarkV1(locale=EN_US, prompt_set="practice"))
-    assert j["uid"] == "general_purpose_ai_chat_benchmark-1.1-en_us-practice-default"
-    assert j["version"] == "1.1"
+    assert j["uid"] == f"general_purpose_ai_chat_benchmark-{CURRENT_GENERAL_VERSION}-en_us-practice-default"
+    assert j["version"] == CURRENT_GENERAL_VERSION
     assert j["prompt_set"] == "practice"
-    assert "safe_hazard-1.1-cse-en_us-practice" in [i["uid"] for i in j["hazards"]]
+    assert f"safe_hazard-{CURRENT_GENERAL_VERSION}-cse-en_us-practice" in [i["uid"] for i in j["hazards"]]
 
 
 def test_security_benchmark_definition():
     j = encode_and_parse(SecurityBenchmark(locale=EN_US, prompt_set="official"))
-    assert j["uid"] == "security_benchmark-1.0-en_us-official-default"
-    assert j["version"] == "1.0"
+    assert j["uid"] == f"security_benchmark-{CURRENT_SECURITY_VERSION}-en_us-official-default"
+    assert j["version"] == CURRENT_SECURITY_VERSION  # NOTE(review): compared against the same constant the benchmark uses, so the literal version string is no longer pinned by this test
     hazard_uids = [i["uid"] for i in j["hazards"]]
-    assert "security_jailbreak_hazard-1.0-en_us-official" in hazard_uids
-    assert "security_naive_hazard-1.0-en_us-official" in hazard_uids
+    assert f"security_jailbreak_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids
+    assert f"security_naive_hazard-{CURRENT_SECURITY_VERSION}-en_us-official" in hazard_uids
 
 
 def test_hazard_score():
diff --git a/tests/modelbench_tests/test_run.py b/tests/modelbench_tests/test_run.py
index 09ffaa68..136a8335 100644
--- a/tests/modelbench_tests/test_run.py
+++ b/tests/modelbench_tests/test_run.py
@@ -40,6 +40,7 @@
 from modelgauge.secret_values import RawSecrets
 from modelgauge.single_turn_prompt_response import TestItem
 from modelgauge.sut import PromptResponseSUT, SUTResponse
+from modelgauge.versions import CURRENT_GENERAL_VERSION
 
 
 class AHazard(HazardDefinition):
@@ -74,7 +75,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
 
     # these will be used to validate the contents of artifacts created by a run object
     test1 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test1-hazard-locale-official-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid=f"test1-hazard-locale-official-{CURRENT_GENERAL_VERSION}"),
+        dependency_data_path=tmp_path,
     )
     test_item1 = TestItem(
         prompt=TextPrompt(text="Why did the chicken cross the road?"), source_id="id1", shareable=True
@@ -92,7 +94,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
 
     # this one should not be included in the annotations file b/c it's marked as non shareable
     test2 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test2-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid=f"test2-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"),
+        dependency_data_path=tmp_path,
     )
     test_item2 = TestItem(
         prompt=TextPrompt(text="What is the airspeed of an unladen swallow?"), source_id="id2", shareable=False
@@ -109,7 +112,7 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
     benchmark_run.add_finished_item(tri2)
 
     test3 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test3-hazard-locale-demo-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid=f"test3-hazard-locale-demo-{CURRENT_GENERAL_VERSION}"), dependency_data_path=tmp_path
     )
     test_item3 = TestItem(prompt=TextPrompt(text="Who's a good boy?"), source_id="id3", shareable=True)
     tri3 = TestRunItem(
@@ -125,7 +128,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
 
     # this one should not be included in the annotations file b/c it doesn't have availability set explicitly
     test4 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test4-hazard-locale-practice-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid=f"test4-hazard-locale-practice-{CURRENT_GENERAL_VERSION}"),
+        dependency_data_path=tmp_path,
     )
     test_item4 = TestItem(
         prompt=TextPrompt(text="Is there anyone on board who knows how to fly a plane?"), source_id="id4"
@@ -143,7 +147,8 @@ def fake_benchmark_run(benchmark, hazards, sut, tmp_path):
 
     # this one should not be included in the annotations file b/c its prompt set is official
     test5 = ModelgaugeTestWrapper(
-        actual_test=MagicMock(uid="test5-hazard-locale-official-1.1"), dependency_data_path=tmp_path
+        actual_test=MagicMock(uid=f"test5-hazard-locale-official-{CURRENT_GENERAL_VERSION}"),
+        dependency_data_path=tmp_path,
     )
     test_item5 = TestItem(
         prompt=TextPrompt(text="What is the way to Carnegie Hall?"),
@@ -308,11 +313,11 @@ def invoke(command, args=None, **kwargs):
     @pytest.mark.parametrize(
         "version,locale,prompt_set",
         [
-            ("1.1", None, None),
-            ("1.1", EN_US, None),
-            ("1.1", EN_US, "practice"),
-            ("1.1", EN_US, "demo"),
-            ("1.1", EN_US, "official"),
+            (CURRENT_GENERAL_VERSION, None, None),
+            (CURRENT_GENERAL_VERSION, EN_US, None),
+            (CURRENT_GENERAL_VERSION, EN_US, "practice"),
+            (CURRENT_GENERAL_VERSION, EN_US, "demo"),
+            (CURRENT_GENERAL_VERSION, EN_US, "official"),
         ],
         # TODO add more locales as we add support for them
     )
@@ -388,11 +393,11 @@ def test_benchmark_basic_run_produces_json(
     @pytest.mark.parametrize(
         "version,locale,prompt_set",
         [
-            ("1.1", None, None),
-            ("1.1", EN_US, None),
-            ("1.1", EN_US, "official"),
-            ("1.1", FR_FR, "practice"),
-            ("1.1", FR_FR, "official"),
+            (CURRENT_GENERAL_VERSION, None, None),
+            (CURRENT_GENERAL_VERSION, EN_US, None),
+            (CURRENT_GENERAL_VERSION, EN_US, "official"),
+            (CURRENT_GENERAL_VERSION, FR_FR, "practice"),
+            (CURRENT_GENERAL_VERSION, FR_FR, "official"),
         ],
         # TODO add more locales as we add support for them
     )
@@ -400,7 +405,6 @@ def test_benchmark_basic_run_produces_json(
     def test_benchmark_multiple_suts_produces_json(
         self, mock_run_benchmarks, runner, version, locale, prompt_set, sut_uid, run_dir, monkeypatch
     ):
-
         benchmark_options = ["--version", version]
         if locale is not None:
             benchmark_options.extend(["--locale", locale])
@@ -454,7 +458,7 @@ def test_general_benchmark_exits_when_consistency_fails(self, runner, benchmark_
         assert result.exit_code == ConsistencyCheckError.EXIT_CODE
 
     def test_benchmark_bad_sut_errors_out(self, runner):
-        benchmark_options = ["--version", "1.1"]
+        benchmark_options = ["--version", CURRENT_GENERAL_VERSION]
         benchmark_options.extend(["--locale", "en_us"])
         benchmark_options.extend(["--prompt-set", "practice"])
 
@@ -508,6 +512,7 @@ def test_calls_score_benchmark_with_correct_v1_locale(self, runner, mock_run_ben
     #
     #     benchmark_arg = mock_score_benchmarks.call_args.args[0][0]
     #     assert isinstance(benchmark_arg, GeneralPurposeAiChatBenchmark)
+
     @pytest.mark.parametrize("sut_uid", ["fake-sut"])
     def test_v1_en_us_demo_is_default(self, runner, mock_run_benchmarks, sut_uid):
         _ = runner(cli, ["benchmark", "general", "--sut", sut_uid])
diff --git a/tests/modelgauge_tests/test_security.py b/tests/modelgauge_tests/test_security.py
index 437ec692..f616e4ed 100644
--- a/tests/modelgauge_tests/test_security.py
+++ b/tests/modelgauge_tests/test_security.py
@@ -9,6 +9,7 @@
 from modelgauge.single_turn_prompt_response import MeasuredTestItem, SUTResponseAnnotations, TestItem
 from modelgauge.sut import SUTResponse
 from modelgauge.tests.security import SecurityJailbreakTest, SecurityNaiveTest
+from modelgauge.versions import CURRENT_SECURITY_VERSION
 
 
 def _make_prompts_file(content, tmp_path):
@@ -70,18 +71,18 @@ def security_naive_test():
 
 def test_create_uid_jailbreak():
     uid = SecurityJailbreakTest.create_uid(EN_US, "official")
-    assert uid == "security-jailbreak-en_us-official-1.0"
+    assert uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}"  # without an evaluator the UID ends at the version
 
     private_uid = SecurityJailbreakTest.create_uid(EN_US, "official", "private")
-    assert private_uid == "security-jailbreak-en_us-official-1.0-private"
+    assert private_uid == f"security-jailbreak-en_us-official-{CURRENT_SECURITY_VERSION}-private"  # the evaluator is appended after the version
 
 
 def test_create_uid_naive():
     uid = SecurityNaiveTest.create_uid(EN_US, "official")
-    assert uid == "security-naive-en_us-official-1.0"
+    assert uid == f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}"  # without an evaluator the UID ends at the version
 
     private_uid = SecurityNaiveTest.create_uid(EN_US, "official", "private")
-    assert private_uid == "security-naive-en_us-official-1.0-private"
+    assert private_uid == f"security-naive-en_us-official-{CURRENT_SECURITY_VERSION}-private"  # the evaluator is appended after the version
 
 
 def test_make_test_items_jailbreak(dependency_helper_jailbreak, security_jailbreak_test):