-
Notifications
You must be signed in to change notification settings - Fork 30
Security v1.0.1 definition #1539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d87b8c1
e1aa737
4c47965
fc99d5c
4d6ea9e
b779e7f
ca89a2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,12 @@ | |
| import casefy | ||
| from pydantic import BaseModel, ConfigDict | ||
|
|
||
| from modelgauge.versions import ( | ||
| CURRENT_GENERAL_V_UNDER, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice. |
||
| CURRENT_GENERAL_VERSION, | ||
| CURRENT_SECURITY_V_UNDER, | ||
| CURRENT_SECURITY_VERSION, | ||
| ) | ||
| from modelgauge.base_test import PromptResponseTest | ||
| from modelgauge.locales import display_for, validate_locale | ||
| from modelgauge.prompt_sets import GENERAL_PROMPT_SETS, SECURITY_JAILBREAK_PROMPT_SETS, validate_prompt_set | ||
|
|
@@ -90,10 +96,10 @@ def __init__(self, hazard_key: str, locale: str, prompt_set: str, evaluator="def | |
|
|
||
| @property | ||
| def reference_key(self): | ||
| return f"safe_hazard-1_1-{self.hazard_key}" | ||
| return f"safe_hazard-{CURRENT_GENERAL_V_UNDER}-{self.hazard_key}" | ||
|
|
||
| def name(self): | ||
| return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V1.1 ({self.prompt_set.title()})" | ||
| return f"Safe {self.hazard_key.title()} {display_for(self.locale)} Hazard V{CURRENT_GENERAL_VERSION} ({self.prompt_set.title()})" | ||
|
|
||
| def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore": | ||
| test_scores: dict[str, ValueEstimate] = {} | ||
|
|
@@ -140,15 +146,15 @@ def test_uids(self) -> List[str]: | |
|
|
||
| _uid_definition = { | ||
| "name": "safe_hazard", | ||
| "version": "1.1", | ||
| "version": CURRENT_GENERAL_VERSION, | ||
| "hazard": "self.hazard_key", | ||
| "locale": "self.locale", | ||
| "prompt_set": "self.prompt_set", | ||
| } | ||
|
|
||
|
|
||
| class BaseSecurityHazard(HazardDefinition, ABC): | ||
| VERSION = "1.0" | ||
| VERSION = CURRENT_SECURITY_VERSION | ||
| persona: str | ||
|
|
||
| def __init__(self, locale: str, prompt_set: str, evaluator="default"): | ||
|
|
@@ -165,7 +171,7 @@ def test_uids(self) -> List[str]: | |
| @property | ||
| def reference_key(self): | ||
| # Both jailbreak and naive hazards use the naive reference score. | ||
| return f"security_naive_official_hazard-{self.VERSION.replace('.','_')}" | ||
| return f"security_naive_official_hazard-{CURRENT_SECURITY_V_UNDER}" | ||
|
|
||
| def name(self): | ||
| return f"Security {self.persona.title()} {display_for(self.locale)} Hazard V{self.VERSION} ({self.prompt_set.title()})" | ||
|
|
||
|
superdosh marked this conversation as resolved.
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| { | ||
| "_metadata": { | ||
| "NOTICE": "This file is a demonstrative sample and does not reflect an actual benchmark standard.", | ||
| "file_format_version": "2.0.0", | ||
| "run_info": { | ||
| "sut_scores": { | ||
| "llama-3.1-8b-instruct-turbo-together": { | ||
| "security_naive_hazard-1.0.1-en_us-official": 0.5 | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "standards": { | ||
| "reference_suts": [ | ||
| "llama-3.1-8b-instruct-turbo-together" | ||
| ], | ||
| "reference_benchmark": "security_naive_benchmark-1.0.1-en_us-official-private", | ||
| "reference_standards": { | ||
| "security_naive_official_hazard-1_0_1": 0.5 | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,5 @@ | ||
| SAFETY_ANNOTATOR_V1_1_UID = "safety-v1.1" | ||
| from modelgauge.versions import CURRENT_GENERAL_VERSION, CURRENT_SECURITY_VERSION | ||
|
|
||
| SAFETY_ANNOTATOR_V1_1_UID = f"safety-v{CURRENT_GENERAL_VERSION}" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I like this! |
||
| SECURITY_ANNOTATOR_V0_5_UID = "security-v0.5" | ||
| SECURITY_ANNOTATOR_V1_0_1_UID = f"security-v{CURRENT_SECURITY_VERSION}" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And this! |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| _GENERAL_V1_0 = "1.1" | ||
| _SECURITY_V1_0_1 = "1.0.1" | ||
|
|
||
| CURRENT_GENERAL_VERSION = _GENERAL_V1_0 | ||
| CURRENT_SECURITY_VERSION = _SECURITY_V1_0_1 | ||
| CURRENT_GENERAL_V_UNDER = CURRENT_GENERAL_VERSION.replace(".", "_") | ||
| CURRENT_SECURITY_V_UNDER = CURRENT_SECURITY_VERSION.replace(".", "_") |
Uh oh!
There was an error while loading. Please reload this page.