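Add targeted_attack import and export; pass PROPERTIES as a dict with re-weighted properties and a new Unicode "plane" property; change default model names in normalization strategies to google/gemma-3-1b-it and make the LLM prompt script-agnostic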

ACMCMC · ACMCMC · commit 503be3fc67f0 · 2025-05-25T21:56:26.000+01:00
diff --git a/silverspeak/__init__.py b/silverspeak/__init__.py
@@ -37,6 +37,7 @@
         __version__ = "unknown"
 
 from silverspeak.homoglyphs.attacks.greedy_attack import greedy_attack
+from silverspeak.homoglyphs.attacks.targeted_attack import targeted_attack
 from silverspeak.homoglyphs.homoglyph_replacer import HomoglyphReplacer
 from silverspeak.homoglyphs.normalize import normalize_text
 from silverspeak.homoglyphs.attacks.random_attack import random_attack
@@ -56,6 +57,7 @@ def get_version() -> str:
 __all__ = [
     "random_attack",
     "greedy_attack",
+    "targeted_attack",
     "normalize_text",
     "HomoglyphReplacer",
     "TypesOfHomoglyphs",
diff --git a/silverspeak/homoglyphs/attacks/targeted_attack.py b/silverspeak/homoglyphs/attacks/targeted_attack.py
@@ -140,15 +140,14 @@ def targeted_attack(
                 score = score_homoglyphs_for_character(
                     homoglyph=homoglyph,
                     char=char,
-                    PROPERTIES=[
-                        {
-                            "script": {"fn": unicodedataplus.script, "weight": 3},
-                            "block": {"fn": unicodedataplus.block, "weight": 5},
-                            "category": {"fn": unicodedata.category, "weight": 10},
-                            "bidirectional": {"fn": unicodedata.bidirectional, "weight": 2},
-                            "east_asian_width": {"fn": unicodedata.east_asian_width, "weight": 1},
-                        }
-                    ],
+                    PROPERTIES={
+                        "script": {"fn": unicodedataplus.script, "weight": 2},
+                        "block": {"fn": unicodedataplus.block, "weight": 5},
+                        "plane": {"fn": lambda c: ord(c) >> 16, "weight": 3},
+                        "category": {"fn": unicodedata.category, "weight": 2},
+                        "bidirectional": {"fn": unicodedata.bidirectional, "weight": 2},
+                        "east_asian_width": {"fn": unicodedata.east_asian_width, "weight": 1},
+                    },
                 )
 
                 possible_replacements.append((homoglyph, score))
diff --git a/silverspeak/homoglyphs/normalization/llm_prompt.py b/silverspeak/homoglyphs/normalization/llm_prompt.py
@@ -18,7 +18,7 @@
 def apply_llm_prompt_strategy(
     text: str,
     mapping: Mapping[str, List[str]],
-    model_name: str = "google/gemma-2-1b-it",
+    model_name: str = "google/gemma-3-1b-it",
     device: Optional[str] = None,
     max_length: int = 512,
     temperature: float = 0.0,
@@ -37,7 +37,7 @@ def apply_llm_prompt_strategy(
         mapping (Mapping[str, List[str]]): A mapping from original characters to
             their possible homoglyph replacements.
         model_name (str): The HuggingFace model name to load.
-            Defaults to "google/gemma-2-1b-it".
+            Defaults to "google/gemma-3-1b-it".
         device (Optional[str]): Device to run the model on ('cuda', 'cpu', etc.).
             Defaults to cuda if available, otherwise cpu.
         max_length (int): Maximum length of text segments to process. Longer text will be split.
@@ -118,11 +118,11 @@ def apply_llm_prompt_strategy(
 For example: {homoglyph_info}
 
 Your task is to read the provided text which may contain homoglyphs (visually similar characters from different scripts)
-and produce a normalized version with standard Latin characters.
+and produce a normalized version with the correct characters.
 
 Important instructions:
 1. Identify any homoglyphs or suspicious characters that might be replacements
-2. Replace them with their standard Latin equivalents
+2. Replace them with their correct characters (which are often in the same alphabet/script as the surrounding text)
 3. Preserve the exact wording, spacing, and punctuation of the original text
 4. If you're uncertain about a character, keep it as is
 5. Return ONLY the normalized text without any explanations or additional comments
diff --git a/silverspeak/homoglyphs/normalization/tokenizer.py b/silverspeak/homoglyphs/normalization/tokenizer.py
@@ -16,7 +16,7 @@
 def apply_tokenizer_strategy(
     text: str,
     mapping: Mapping[str, List[str]],
-    tokenizer_name: str = "google/gemma-3-1b-pt",
+    tokenizer_name: str = "google/gemma-3-1b-it",
     **kwargs,
 ) -> str:
     """