diff --git a/data/languages/az/language_config.json b/data/languages/az/language_config.json index 403e108b..37f84a8d 100644 --- a/data/languages/az/language_config.json +++ b/data/languages/az/language_config.json @@ -62,25 +62,5 @@ "play_daily_like_app": "Wordle-ı hər gün tətbiq kimi oyna", "install": "Quraşdır", "close": "bağla" - }, - "diacritic_map": { - "c": [ - "ç" - ], - "o": [ - "ö" - ], - "u": [ - "ü" - ], - "g": [ - "ğ" - ], - "i": [ - "ı" - ], - "s": [ - "ş" - ] } } diff --git a/data/languages/bg/language_config.json b/data/languages/bg/language_config.json index 758cc62e..751980d1 100644 --- a/data/languages/bg/language_config.json +++ b/data/languages/bg/language_config.json @@ -64,10 +64,5 @@ "play_daily_like_app": "Играй Wordle всеки ден като приложение", "install": "Инсталирай", "close": "затвори" - }, - "diacritic_map": { - "и": [ - "й" - ] } } diff --git a/data/languages/cs/language_config.json b/data/languages/cs/language_config.json index 9b47191d..62fe53a3 100644 --- a/data/languages/cs/language_config.json +++ b/data/languages/cs/language_config.json @@ -40,8 +40,7 @@ "á" ], "e": [ - "é", - "ě" + "é" ], "i": [ "í" @@ -49,33 +48,8 @@ "o": [ "ó" ], - "u": [ - "ú", - "ů" - ], "y": [ "ý" - ], - "c": [ - "č" - ], - "d": [ - "ď" - ], - "n": [ - "ň" - ], - "r": [ - "ř" - ], - "s": [ - "š" - ], - "t": [ - "ť" - ], - "z": [ - "ž" ] } } diff --git a/data/languages/da/language_config.json b/data/languages/da/language_config.json index 90430885..37439999 100644 --- a/data/languages/da/language_config.json +++ b/data/languages/da/language_config.json @@ -63,14 +63,5 @@ "play_daily_like_app": "Spil Wordle dagligt som en app", "install": "Installer", "close": "luk" - }, - "diacritic_map": { - "a": [ - "å", - "æ" - ], - "o": [ - "ø" - ] } } diff --git a/data/languages/et/language_config.json b/data/languages/et/language_config.json index d4d57cd4..cf842c05 100644 --- a/data/languages/et/language_config.json +++ b/data/languages/et/language_config.json @@ 
-63,23 +63,5 @@ "play_daily_like_app": "Mängi Wordle't iga päev nagu rakendust", "install": "Paigalda", "close": "sulge" - }, - "diacritic_map": { - "a": [ - "ä" - ], - "o": [ - "õ", - "ö" - ], - "u": [ - "ü" - ], - "s": [ - "š" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/fi/language_config.json b/data/languages/fi/language_config.json index ad9d6c15..92593659 100644 --- a/data/languages/fi/language_config.json +++ b/data/languages/fi/language_config.json @@ -68,15 +68,10 @@ }, "diacritic_map": { "a": [ - "à", - "ä", - "å" + "à" ], "e": [ "é" - ], - "o": [ - "ö" ] } } diff --git a/data/languages/fo/language_config.json b/data/languages/fo/language_config.json index 1ec1595c..71d3bc76 100644 --- a/data/languages/fo/language_config.json +++ b/data/languages/fo/language_config.json @@ -37,18 +37,13 @@ }, "diacritic_map": { "a": [ - "á", - "æ" + "á" ], "i": [ "í" ], - "d": [ - "ð" - ], "o": [ - "ó", - "ø" + "ó" ], "u": [ "ú" diff --git a/data/languages/hr/language_config.json b/data/languages/hr/language_config.json index 77daa6e9..cd4b5cd8 100644 --- a/data/languages/hr/language_config.json +++ b/data/languages/hr/language_config.json @@ -63,20 +63,5 @@ "play_daily_like_app": "Igraj Wordle svakodnevno kao aplikaciju", "install": "Instaliraj", "close": "zatvori" - }, - "diacritic_map": { - "c": [ - "ć", - "č" - ], - "d": [ - "đ" - ], - "s": [ - "š" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/hu/language_config.json b/data/languages/hu/language_config.json index 0cd9ce0e..fe879ed4 100644 --- a/data/languages/hu/language_config.json +++ b/data/languages/hu/language_config.json @@ -75,14 +75,10 @@ "í" ], "o": [ - "ó", - "ö", - "ő" + "ó" ], "u": [ - "ú", - "ü", - "ű" + "ú" ] } } diff --git a/data/languages/is/language_config.json b/data/languages/is/language_config.json index 511a0e56..c65dcab4 100644 --- a/data/languages/is/language_config.json +++ b/data/languages/is/language_config.json @@ -36,8 +36,7 @@ }, "diacritic_map": { "a": [ - "á", - "æ" 
+ "á" ], "e": [ "é" @@ -45,21 +44,14 @@ "i": [ "í" ], - "d": [ - "ð" - ], "o": [ - "ó", - "ö" + "ó" ], "u": [ "ú" ], "y": [ "ý" - ], - "t": [ - "þ" ] } } diff --git a/data/languages/lb/language_config.json b/data/languages/lb/language_config.json index 987c7fbd..65ea55ec 100644 --- a/data/languages/lb/language_config.json +++ b/data/languages/lb/language_config.json @@ -36,12 +36,8 @@ "text_3": "En neit Wuert gëtt all Dag disponibel! " }, "diacritic_map": { - "a": [ - "ä" - ], "e": [ - "é", - "ë" + "é" ] } } diff --git a/data/languages/lt/language_config.json b/data/languages/lt/language_config.json index a92fdcf9..c494c260 100644 --- a/data/languages/lt/language_config.json +++ b/data/languages/lt/language_config.json @@ -34,30 +34,5 @@ "text_2_2": "yra žodyje, bet ne teisingoje vietoje. ", "text_2_3": "nėra žodžio, kurį bandote atspėti. ", "text_3": "Kiekvieną dieną bus prieinamas naujas žodis! " - }, - "diacritic_map": { - "a": [ - "ą" - ], - "c": [ - "č" - ], - "e": [ - "ė", - "ę" - ], - "i": [ - "į" - ], - "s": [ - "š" - ], - "u": [ - "ū", - "ų" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/ltg/language_config.json b/data/languages/ltg/language_config.json index af632ae1..5ef93c5d 100644 --- a/data/languages/ltg/language_config.json +++ b/data/languages/ltg/language_config.json @@ -34,40 +34,5 @@ "text_2_2": "is in the word, but not in the correct location.", "text_2_3": "is not present in the word you are trying to guess.", "text_3": "A new word will be available each day!" 
- }, - "diacritic_map": { - "a": [ - "ā" - ], - "c": [ - "č" - ], - "e": [ - "ē" - ], - "i": [ - "ī" - ], - "k": [ - "ķ" - ], - "l": [ - "ļ" - ], - "n": [ - "ņ" - ], - "o": [ - "ō" - ], - "s": [ - "š" - ], - "u": [ - "ū" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/lv/language_config.json b/data/languages/lv/language_config.json index 799d697b..4ff416ba 100644 --- a/data/languages/lv/language_config.json +++ b/data/languages/lv/language_config.json @@ -63,40 +63,5 @@ "play_daily_like_app": "Spēlē Wordle katru dienu kā lietotni", "install": "Instalēt", "close": "aizvērt" - }, - "diacritic_map": { - "a": [ - "ā" - ], - "c": [ - "č" - ], - "e": [ - "ē" - ], - "g": [ - "ģ" - ], - "i": [ - "ī" - ], - "k": [ - "ķ" - ], - "l": [ - "ļ" - ], - "n": [ - "ņ" - ], - "s": [ - "š" - ], - "u": [ - "ū" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/mk/language_config.json b/data/languages/mk/language_config.json index 3d68e0b6..441529fb 100644 --- a/data/languages/mk/language_config.json +++ b/data/languages/mk/language_config.json @@ -64,13 +64,5 @@ "play_daily_like_app": "Играј Wordle секојдневно како апликација", "install": "Инсталирај", "close": "затвори" - }, - "diacritic_map": { - "г": [ - "ѓ" - ], - "к": [ - "ќ" - ] } } diff --git a/data/languages/mn/language_config.json b/data/languages/mn/language_config.json index f121a495..627e2b9a 100644 --- a/data/languages/mn/language_config.json +++ b/data/languages/mn/language_config.json @@ -35,13 +35,5 @@ "text_2_2": "гэдэг үгэнд байгаа боловч зөв байршилд байдаггүй. ", "text_2_3": "таах гэж байгаа үгэндээ байхгүй байна. ", "text_3": "Өдөр бүр шинэ үг бэлэн болно! 
" - }, - "diacritic_map": { - "и": [ - "й" - ], - "е": [ - "ё" - ] } } diff --git a/data/languages/pl/language_config.json b/data/languages/pl/language_config.json index be22dc20..00d155f0 100644 --- a/data/languages/pl/language_config.json +++ b/data/languages/pl/language_config.json @@ -66,34 +66,13 @@ }, "diacritic_map": { "c": [ - "ç", - "ć" + "ç" ], "o": [ - "ó", "ö" ], "u": [ "ü" - ], - "a": [ - "ą" - ], - "e": [ - "ę" - ], - "l": [ - "ł" - ], - "n": [ - "ń" - ], - "s": [ - "ś" - ], - "z": [ - "ź", - "ż" ] } } diff --git a/data/languages/ro/language_config.json b/data/languages/ro/language_config.json index 2f92cead..81e59f19 100644 --- a/data/languages/ro/language_config.json +++ b/data/languages/ro/language_config.json @@ -63,20 +63,5 @@ "play_daily_like_app": "Joacă Wordle zilnic ca o aplicație", "install": "Instalează", "close": "închide" - }, - "diacritic_map": { - "a": [ - "â", - "ă" - ], - "i": [ - "î" - ], - "s": [ - "ș" - ], - "t": [ - "ț" - ] } } diff --git a/data/languages/ru/language_config.json b/data/languages/ru/language_config.json index 15ef3661..d5c2ab4d 100644 --- a/data/languages/ru/language_config.json +++ b/data/languages/ru/language_config.json @@ -66,9 +66,6 @@ "close": "закрыть" }, "diacritic_map": { - "и": [ - "й" - ], "е": [ "ё" ] diff --git a/data/languages/sk/language_config.json b/data/languages/sk/language_config.json index 73952f11..ded984b4 100644 --- a/data/languages/sk/language_config.json +++ b/data/languages/sk/language_config.json @@ -76,39 +76,13 @@ "í" ], "o": [ - "ó", - "ô" + "ó" ], "u": [ "ú" ], "y": [ "ý" - ], - "c": [ - "č" - ], - "d": [ - "ď" - ], - "l": [ - "ĺ", - "ľ" - ], - "n": [ - "ň" - ], - "r": [ - "ŕ" - ], - "s": [ - "š" - ], - "t": [ - "ť" - ], - "z": [ - "ž" ] } } diff --git a/data/languages/sl/language_config.json b/data/languages/sl/language_config.json index 90608edd..ba03ab3f 100644 --- a/data/languages/sl/language_config.json +++ b/data/languages/sl/language_config.json @@ -34,16 +34,5 @@ "text_2_2": 
"je v besedi, vendar ne na pravi lokaciji. ", "text_2_3": "ni prisotna v besedi, ki jo poskušate uganiti. ", "text_3": "Vsak dan bo na voljo nova beseda! " - }, - "diacritic_map": { - "c": [ - "č" - ], - "s": [ - "š" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/sq/language_config.json b/data/languages/sq/language_config.json index cbc8b8a0..bd281a5e 100644 --- a/data/languages/sq/language_config.json +++ b/data/languages/sq/language_config.json @@ -51,13 +51,5 @@ "search_language": "Kërko gjuhën...", "definition": "Përkufizimi", "look_up_on_wiktionary": "Kërko në Wiktionary" - }, - "diacritic_map": { - "c": [ - "ç" - ], - "e": [ - "ë" - ] } } diff --git a/data/languages/sv/language_config.json b/data/languages/sv/language_config.json index c675023d..7326ce6c 100644 --- a/data/languages/sv/language_config.json +++ b/data/languages/sv/language_config.json @@ -63,14 +63,5 @@ "play_daily_like_app": "Spela Wordle dagligen som en app", "install": "Installera", "close": "stäng" - }, - "diacritic_map": { - "a": [ - "ä", - "å" - ], - "o": [ - "ö" - ] } } diff --git a/data/languages/tk/language_config.json b/data/languages/tk/language_config.json index 3bfc45c3..9961b2ee 100644 --- a/data/languages/tk/language_config.json +++ b/data/languages/tk/language_config.json @@ -34,31 +34,5 @@ "text_2_2": "sözünde, ýöne dogry ýerde däl. ", "text_2_3": "çaklamaga synanyşýan sözüňizde ýok. ", "text_3": "Her gün täze söz bolar! 
" - }, - "diacritic_map": { - "a": [ - "ä" - ], - "c": [ - "ç" - ], - "o": [ - "ö" - ], - "u": [ - "ü" - ], - "y": [ - "ý" - ], - "n": [ - "ň" - ], - "s": [ - "ş" - ], - "z": [ - "ž" - ] } } diff --git a/data/languages/tr/language_config.json b/data/languages/tr/language_config.json index 33481bcd..2ae088ab 100644 --- a/data/languages/tr/language_config.json +++ b/data/languages/tr/language_config.json @@ -63,25 +63,5 @@ "play_daily_like_app": "Wordle'ı her gün bir uygulama gibi oyna", "install": "Yükle", "close": "kapat" - }, - "diacritic_map": { - "c": [ - "ç" - ], - "o": [ - "ö" - ], - "u": [ - "ü" - ], - "g": [ - "ğ" - ], - "i": [ - "ı" - ], - "s": [ - "ş" - ] } } diff --git a/data/languages/uk/language_config.json b/data/languages/uk/language_config.json index ed648290..827485e4 100644 --- a/data/languages/uk/language_config.json +++ b/data/languages/uk/language_config.json @@ -63,13 +63,5 @@ "play_daily_like_app": "Грайте у Wordle щодня як у додаток", "install": "Встановити", "close": "закрити" - }, - "diacritic_map": { - "и": [ - "й" - ], - "і": [ - "ї" - ] } } diff --git a/scripts/fix_diacritic_maps.py b/scripts/fix_diacritic_maps.py new file mode 100644 index 00000000..4c7c0080 --- /dev/null +++ b/scripts/fix_diacritic_maps.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +"""Fix diacritic_maps that incorrectly normalize distinct alphabet letters. + +Rule: if a character has its own key on the keyboard, it's a distinct letter +and should NOT be in the diacritic_map (the color algorithm would wrongly +treat it as equivalent to its base). 
+ +Language-by-language decisions for the 43 maps added in 574ab2f: + +REMOVE ENTIRE MAP (all mapped chars are distinct alphabet letters): + az - Azerbaijani: ç,ö,ü,ğ,ı,ş are all separate letters + bg - Bulgarian: й is distinct from и + da - Danish: å,æ,ø are 27th-29th letters + et - Estonian: ä,õ,ö,ü,š,ž are distinct + hr - Croatian: ć,č,đ,š,ž are distinct + lt - Lithuanian: ą,č,ė,ę,į,š,ū,ų,ž are distinct + ltg - Latgalian: ā,č,ē,ī,ķ,ļ,ņ,ō,š,ū,ž are distinct + lv - Latvian: ā,č,ē,ģ,ī,ķ,ļ,ņ,š,ū,ž are distinct + mk - Macedonian: ѓ,ќ are distinct + mn - Mongolian: й,ё are distinct + ro - Romanian: â,î,ă,ș,ț are distinct + sl - Slovenian: č,š,ž are distinct + sq - Albanian: ç,ë are distinct letters + sv - Swedish: ä,ö,å are 27th-29th letters + tk - Turkmen: all mapped chars are distinct + tr - Turkish: ç,ğ,ı,ö,ş,ü are distinct (especially ı vs i!) + uk - Ukrainian: й,ї are distinct + +REMOVE ON-KEYBOARD CHARS, KEEP OFF-KEYBOARD ACCENT VARIANTS: + cs - Czech: remove háček letters (č,ď,ě,ň,ř,š,ť,ů,ž,ú on kb); + keep long vowels (á,é,í,ó,ý not on kb — legitimate variants) + fi - Finnish: remove ä,ö,å (distinct, on kb); + keep à,é (foreign accent marks, not on kb) + fo - Faroese: remove æ,ð,ø (distinct, on kb); + keep á,í,ó,ú,ý (accent variants, not on kb) + hu - Hungarian: remove ö,ü,ő,ű (distinct, on kb); + keep á,é,í,ó,ú (long vowels, not on kb) + is - Icelandic: remove æ,ð,ö,þ (distinct, on kb); + keep á,é,í,ó,ú,ý (accent variants, not on kb) + lb - Luxembourgish: remove ä,ë (on kb); + keep é (not on kb) + pl - Polish: remove ą,ć,ę,ł,ń,ó,ś,ź,ż (distinct, on kb); + keep ç,ö,ü (foreign chars, not on kb) + sk - Slovak: remove háček letters + ô (on kb); + keep long vowels á,ä,é,í,ó,ú,ý (not on kb) + +KEEP ENTIRE MAP (chars are genuine accent/variant marks): + br - Breton: ê,ù not on kb (keep); ñ on kb but acceptable for Breton + ckb - Kurdish: hamza variants are interchangeable in Arabic script + eo - Esperanto: circumflex letters (none on kb, all variants) + eu - Basque: 
ç not on kb (keep); ñ on kb but is a variant in Basque + fa - Persian: alef/hamza forms are genuinely interchangeable + fur - Friulian: accent marks (only ç on kb, rest not) + fy - Frisian: all accent variants, none on kb + ga - Irish: fada vowels, none on kb + gd - Scottish Gaelic: grave vowels, none on kb + hi - Hindi: nukta variants, none on kb + ie - Interlingue: accent vowels, none on kb + mi - Māori: macrons on kb BUT macrons are length marks, not distinct + letters — ā is "long a" and matching a→ā is a useful convenience + nds - Low German: umlauts on kb BUT Low German treats them as variants + (unlike Finnish/Swedish, where umlauted vowels are distinct letters) + oc - Occitan: accent marks (only ç on kb) + qya - Quenya: fictional lang, accents are variants (ñ,þ on kb but ok) + ru - Russian: ё→е is universally accepted normalization (most Russian + text doesn't use ё); й→и is wrong but ё→е is so standard we keep it + tl - Tagalog: ñ on kb but is a Spanish loanword accent, not distinct + ur - Urdu: hamza variants are interchangeable in Arabic script + +SPECIAL CASE — partial fix needed: + ru - Remove й→и mapping (distinct), keep ё→е (standard normalization) +""" + +import json +import os + +DATA = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data", "languages" +) + +# Languages to remove ENTIRE diacritic_map +REMOVE_ALL = { + "az", + "bg", + "da", + "et", + "hr", + "lt", + "ltg", + "lv", + "mk", + "mn", + "ro", + "sl", + "sq", + "sv", + "tk", + "tr", + "uk", +} + +# Languages to remove specific chars from map (language code → set of variants to remove) +REMOVE_SPECIFIC = { + "cs": {"ě", "ú", "ů", "č", "ď", "ň", "ř", "š", "ť", "ž"}, + "fi": {"ä", "å", "ö"}, + "fo": {"æ", "ð", "ø"}, + "hu": {"ö", "ü", "ő", "ű"}, + "is": {"æ", "ð", "ö", "þ"}, + "lb": {"ä", "ë"}, + "pl": {"ą", "ć", "ę", "ł", "ń", "ó", "ś", "ź", "ż"}, + "sk": {"č", "ď", "ĺ", "ľ", "ň", "ô", "ŕ", "š", "ť", "ž"}, + "ru": {"й"}, # Keep ё→е (standard Russian normalization) +} + +
def fix_language(lang_code):
    """Rewrite one language's ``language_config.json`` with a corrected diacritic_map.

    Languages listed in ``REMOVE_ALL`` lose the whole ``diacritic_map`` key.
    Languages listed in ``REMOVE_SPECIFIC`` lose only the listed variant
    characters; a base letter left without variants is dropped, and the key
    itself is deleted when no base letter remains.

    Args:
        lang_code: Directory name under ``DATA`` (for example ``"fi"``).

    Returns:
        True if the config file was rewritten; False if the config has no
        ``diacritic_map`` or the language is in neither table (the file is
        then left untouched).

    Raises:
        OSError: If the config file cannot be read or written.
        json.JSONDecodeError: If the config file is not valid JSON.
    """
    cfg_path = os.path.join(DATA, lang_code, "language_config.json")
    # The configs are full of non-ASCII letters, so pin UTF-8 instead of
    # relying on the locale's preferred encoding.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)

    if "diacritic_map" not in cfg:
        return False

    if lang_code in REMOVE_ALL:
        del cfg["diacritic_map"]
        print(f" {lang_code}: removed entire diacritic_map")
    elif lang_code in REMOVE_SPECIFIC:
        chars_to_remove = REMOVE_SPECIFIC[lang_code]
        new_map = {}
        for base, variants in cfg["diacritic_map"].items():
            kept = [v for v in variants if v not in chars_to_remove]
            if kept:  # drop base letters that have no variants left
                new_map[base] = kept
        if new_map:
            cfg["diacritic_map"] = new_map
            print(f" {lang_code}: removed {sorted(chars_to_remove)} from map, kept {new_map}")
        else:
            del cfg["diacritic_map"]
            print(f" {lang_code}: removed all chars → removed entire map")
    else:
        return False

    # ensure_ascii=False emits the letters literally, so the output encoding
    # must be able to represent them: write UTF-8 explicitly.
    with open(cfg_path, "w", encoding="utf-8") as f:
        json.dump(cfg, f, indent=4, ensure_ascii=False)
        f.write("\n")
    return True


def main():
    """Apply ``fix_language`` to every listed language and print how many files changed.

    Languages whose config file does not exist are skipped silently.
    """
    count = 0
    for lang_code in sorted(REMOVE_ALL | set(REMOVE_SPECIFIC.keys())):
        cfg_path = os.path.join(DATA, lang_code, "language_config.json")
        if os.path.exists(cfg_path) and fix_language(lang_code):
            count += 1
    print(f"\nFixed {count} languages")


if __name__ == "__main__":
    main()
"""
Tests for diacritic_map correctness.

Core rule: if a character has its own key on the keyboard, it is a distinct
letter and must NOT appear in the diacritic_map. Normalizing it would cause
the color algorithm to wrongly treat it as equivalent to its base character.

Example: Finnish has ö as a separate keyboard key and the 28th letter of the
alphabet. Mapping ö→o would make the game show yellow/green for ö when the
answer has o (or vice versa), which is wrong.

Some languages intentionally have keyboard chars in their diacritic_map
(e.g., German treats ö as a variant of o for Wordle purposes). These are
in the ALLOWLIST below and were verified by the original language setup.
"""

import json
from pathlib import Path

import pytest

PROJECT_ROOT = Path(__file__).parent.parent
DATA_DIR = PROJECT_ROOT / "data"
LANG_DIR = DATA_DIR / "languages"

# Languages where keyboard chars in diacritic_map are INTENTIONAL.
# These were set up by native speakers or verified pre-existing configs.
# Any new language wanting this must be added here with justification.
ALLOWLIST = {
    # Pre-existing maps (verified before Nuxt migration):
    "ar",  # Arabic: hamza/alef forms are interchangeable in Arabic script
    "ca",  # Catalan: accents are stress marks, not distinct letters
    "de",  # German: ä/ö/ü are treated as variants of a/o/u in Wordle
    "el",  # Greek: accented vowels are same letter with stress mark
    "es",  # Spanish: accented vowels are stress marks
    "fr",  # French: accents are diacritical marks on base letters
    "gl",  # Galician: same as Spanish/Portuguese
    "it",  # Italian: accents are stress marks
    "ko",  # Korean: jamo composition
    "nb",  # Norwegian Bokmål: é/è/ó are accent variants (but æ/ø/å are NOT mapped)
    "nl",  # Dutch: accents are stress marks
    "nn",  # Norwegian Nynorsk: same as nb
    "pt",  # Portuguese: accents are diacritical marks
    "vi",  # Vietnamese: tone marks are diacritical
    "yo",  # Yoruba: tone/dot marks are diacritical
    # Post-migration additions with keyboard conflicts but intentionally kept:
    "br",  # Breton: ñ on kb but treated as accent variant
    "ckb",  # Kurdish: hamza variants are interchangeable
    "eu",  # Basque: ñ on kb but is a variant
    "fa",  # Persian: alef/hamza forms are interchangeable
    "fur",  # Friulian: ç on kb but accent variants
    "mi",  # Māori: macrons are length marks, not distinct letters
    "nds",  # Low German: umlauts treated as variants (unlike Finnish/Swedish)
    "oc",  # Occitan: ç on kb but accent variants
    "qya",  # Quenya: fictional, accents are variants
    "ru",  # Russian: ё→е is universally accepted normalization
    "tl",  # Tagalog: ñ is a Spanish loanword accent
    "ur",  # Urdu: hamza variants are interchangeable
}


def _read_json(path: Path):
    """Parse ``path`` as UTF-8 JSON.

    The language files contain non-ASCII letters, so the encoding is pinned
    instead of depending on the locale's preferred encoding.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def get_keyboard_chars(lang: str) -> set[str]:
    """Get all characters that appear on the keyboard for a language.

    Looks for ``<lang>_keyboard.json`` first and falls back to the
    ``keyboard`` entry of ``language_config.json``. Two shapes are accepted:
    a list of rows, or a dict with a ``layouts`` mapping whose values are
    either a list of rows or a dict holding ``rows``.

    Returns:
        The set of key labels with the control keys "⇨" and "⌫" removed;
        empty when no keyboard data is found.
    """
    kb_chars: set[str] = set()

    # Try dedicated keyboard file
    kb_path = LANG_DIR / lang / f"{lang}_keyboard.json"
    kb = None
    if kb_path.exists():
        kb = _read_json(kb_path)

    # Fall back to keyboard in language_config
    if not kb:
        cfg_path = LANG_DIR / lang / "language_config.json"
        if cfg_path.exists():
            kb = _read_json(cfg_path).get("keyboard")

    if not kb:
        return kb_chars

    # Handle both array format and layouts format
    if isinstance(kb, list):
        for row in kb:
            if isinstance(row, list):
                kb_chars.update(row)
    elif isinstance(kb, dict):
        layouts = kb.get("layouts", {})
        for layout in layouts.values():
            rows = layout.get("rows", layout) if isinstance(layout, dict) else layout
            if isinstance(rows, list):
                for row in rows:
                    if isinstance(row, list):
                        kb_chars.update(row)

    # Remove control keys
    kb_chars.discard("⇨")
    kb_chars.discard("⌫")
    return kb_chars


def get_languages_with_diacritic_maps() -> list[str]:
    """Get all language codes that have a non-empty diacritic_map.

    Scans the sub-directories of ``LANG_DIR`` in sorted order and skips any
    that have no ``language_config.json``.
    """
    langs = []
    for lang_dir in sorted(LANG_DIR.iterdir()):
        if not lang_dir.is_dir():
            continue
        cfg_path = lang_dir / "language_config.json"
        if not cfg_path.exists():
            continue
        if _read_json(cfg_path).get("diacritic_map"):
            langs.append(lang_dir.name)
    return langs


# Evaluated at import time so pytest can parametrize over the languages.
LANGUAGES_WITH_MAPS = get_languages_with_diacritic_maps()


class TestDiacriticMapCorrectness:
    """Ensure diacritic_maps don't normalize distinct alphabet letters."""

    @pytest.mark.parametrize("lang", LANGUAGES_WITH_MAPS)
    def test_no_keyboard_chars_in_diacritic_map(self, lang):
        """Characters with their own keyboard key should not be in diacritic_map.

        If a character has its own key, it's a distinct letter in that language.
        Normalizing it would cause wrong tile colors (e.g., ö showing yellow
        when the answer has o in Finnish).

        Allowlisted languages have been verified as intentional.
        """
        if lang in ALLOWLIST:
            pytest.skip(f"{lang}: allowlisted (keyboard chars in map are intentional)")

        cfg = _read_json(LANG_DIR / lang / "language_config.json")
        dmap = cfg.get("diacritic_map", {})
        kb_chars = get_keyboard_chars(lang)

        if not kb_chars:
            pytest.skip(f"{lang}: no keyboard data")

        # Find diacritic chars that are also keyboard keys
        diacritic_chars = {v for variants in dmap.values() for v in variants}
        conflicts = diacritic_chars & kb_chars

        assert not conflicts, (
            f"{lang}: diacritic_map contains keyboard characters {sorted(conflicts)}. "
            f"These are distinct letters and should NOT be normalized. "
            f"If this is intentional, add '{lang}' to ALLOWLIST in this test with justification."
        )