Skip to content

Commit 503be3f

Browse files
committed
Add targeted_attack import and update properties; change model names in normalization strategies
1 parent f0f675a commit 503be3f

4 files changed

Lines changed: 15 additions & 14 deletions

File tree

silverspeak/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
__version__ = "unknown"
3838

3939
from silverspeak.homoglyphs.attacks.greedy_attack import greedy_attack
40+
from silverspeak.homoglyphs.attacks.targeted_attack import targeted_attack
4041
from silverspeak.homoglyphs.homoglyph_replacer import HomoglyphReplacer
4142
from silverspeak.homoglyphs.normalize import normalize_text
4243
from silverspeak.homoglyphs.attacks.random_attack import random_attack
@@ -56,6 +57,7 @@ def get_version() -> str:
5657
__all__ = [
5758
"random_attack",
5859
"greedy_attack",
60+
"targeted_attack",
5961
"normalize_text",
6062
"HomoglyphReplacer",
6163
"TypesOfHomoglyphs",

silverspeak/homoglyphs/attacks/targeted_attack.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -140,15 +140,14 @@ def targeted_attack(
140140
score = score_homoglyphs_for_character(
141141
homoglyph=homoglyph,
142142
char=char,
143-
PROPERTIES=[
144-
{
145-
"script": {"fn": unicodedataplus.script, "weight": 3},
146-
"block": {"fn": unicodedataplus.block, "weight": 5},
147-
"category": {"fn": unicodedata.category, "weight": 10},
148-
"bidirectional": {"fn": unicodedata.bidirectional, "weight": 2},
149-
"east_asian_width": {"fn": unicodedata.east_asian_width, "weight": 1},
150-
}
151-
],
143+
PROPERTIES={
144+
"script": {"fn": unicodedataplus.script, "weight": 2},
145+
"block": {"fn": unicodedataplus.block, "weight": 5},
146+
"plane": {"fn": lambda c: ord(c) >> 16, "weight": 3},
147+
"category": {"fn": unicodedata.category, "weight": 2},
148+
"bidirectional": {"fn": unicodedata.bidirectional, "weight": 2},
149+
"east_asian_width": {"fn": unicodedata.east_asian_width, "weight": 1},
150+
},
152151
)
153152

154153
possible_replacements.append((homoglyph, score))

silverspeak/homoglyphs/normalization/llm_prompt.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
def apply_llm_prompt_strategy(
1919
text: str,
2020
mapping: Mapping[str, List[str]],
21-
model_name: str = "google/gemma-2-1b-it",
21+
model_name: str = "google/gemma-3-1b-it",
2222
device: Optional[str] = None,
2323
max_length: int = 512,
2424
temperature: float = 0.0,
@@ -37,7 +37,7 @@ def apply_llm_prompt_strategy(
3737
mapping (Mapping[str, List[str]]): A mapping from original characters to
3838
their possible homoglyph replacements.
3939
model_name (str): The HuggingFace model name to load.
40-
Defaults to "google/gemma-2-1b-it".
40+
Defaults to "google/gemma-3-1b-it".
4141
device (Optional[str]): Device to run the model on ('cuda', 'cpu', etc.).
4242
Defaults to cuda if available, otherwise cpu.
4343
max_length (int): Maximum length of text segments to process. Longer text will be split.
@@ -118,11 +118,11 @@ def apply_llm_prompt_strategy(
118118
For example: {homoglyph_info}
119119
120120
Your task is to read the provided text which may contain homoglyphs (visually similar characters from different scripts)
121-
and produce a normalized version with standard Latin characters.
121+
and produce a normalized version with the correct characters.
122122
123123
Important instructions:
124124
1. Identify any homoglyphs or suspicious characters that might be replacements
125-
2. Replace them with their standard Latin equivalents
125+
2. Replace them with their correct characters (which are often in the same alphabet/script as the surrounding text)
126126
3. Preserve the exact wording, spacing, and punctuation of the original text
127127
4. If you're uncertain about a character, keep it as is
128128
5. Return ONLY the normalized text without any explanations or additional comments

silverspeak/homoglyphs/normalization/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
def apply_tokenizer_strategy(
1717
text: str,
1818
mapping: Mapping[str, List[str]],
19-
tokenizer_name: str = "google/gemma-3-1b-pt",
19+
tokenizer_name: str = "google/gemma-3-1b-it",
2020
**kwargs,
2121
) -> str:
2222
"""

0 commit comments

Comments
 (0)