Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 58 additions & 1 deletion nemo/collections/common/tokenizers/text_to_speech/ipa_lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# fmt: off

SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN"]
SUPPORTED_LOCALES = ["en-US", "de-DE", "es-ES", "it-IT", "fr-FR", "vi-VN", "ja-JP", "hi-IN", "kn-IN"]

DEFAULT_PUNCTUATION = (
',', '.', '!', '?', '-',
Expand Down Expand Up @@ -107,6 +107,31 @@
# Danda (period)
'।',
),
"kn-IN": (
# Independent Vowels (Swaras)
'ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ೠ', 'ಌ', 'ೡ',
'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ',
# Consonants (Vyanjanas)
# Velar
'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ',
# Palatal
'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ',
# Retroflex
'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ',
# Dental
'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ',
# Labial
'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ',
# Approximants, liquids, sibilants, and the glottal fricative
'ಯ', 'ರ', 'ಱ', 'ಲ', 'ಳ', 'ೞ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ',
# Dependent Vowel Signs (Matras)
'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೄ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ',
# Various Signs
'ಂ', # Anusvara
'ಃ', # Visarga
'್', # Virama (Halant)
'ಽ', # Avagraha
),
}

IPA_CHARACTER_SETS = {
Expand Down Expand Up @@ -183,6 +208,30 @@
'ɡ', 'ɣ', 'ɪ', 'ɭ', 'ɲ', 'ɳ', 'ɾ', 'ʂ', 'ʃ', 'ʈ',
'ʊ', 'ʋ', 'ʌ', 'ʰ', 'ː', '̃', '̩', 'χ',
),
# Kannada IPA phoneme set in split form: aspiration, length, and nasalization are
# separate tokens (e.g. 'k' + 'ʰ', 'a' + 'ː') rather than precomposed symbols.
"kn-IN": (
# Vowels (monophthongs) - base forms only
'a', 'i', 'u', 'e', 'o',
'ə', # schwa (inherent vowel, sometimes realized)
'ɯ', # close back unrounded vowel (for vocalic R: ಋ, ೃ)
# Consonants - Stops (base forms only)
'k', 'g', # Velar
'ʈ', 'ɖ', # Retroflex
't', 'd', # Dental
'p', 'b', # Labial
# Nasals
'ŋ', 'ɲ', 'ɳ', 'n', 'm',
# Approximants
'j', 'ʋ', 'w',
# Liquids
'r', 'ɾ', 'l', 'ɭ', 'ɻ',
# Fricatives; ʃ and ʒ also form the second element of the affricates tʃ and dʒ
# (combined with the 't' / 'd' stop tokens above)
'ʃ', 'ʒ', 'ʂ', 's', 'h',
# Modifiers (separate tokens, like Hindi/Japanese)
'ʰ', # Aspiration marker
'ː', # Length marker
'̃', # Nasalization (combining tilde)
),
}

GRAPHEME_CHARACTER_CASES = ["upper", "lower", "mixed"]
Expand Down Expand Up @@ -347,5 +396,13 @@ def get_ipa_punctuation_list(locale):
'・',
]
)
elif locale == "kn-IN":
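# Kannada punctuation: the danda and double danda are encoded in the Devanagari
# block (U+0964, U+0965) but are shared by Indic scripts, including Kannada.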
punct_set.update(
[
'।', # Devanagari Danda (single)
'॥', # Devanagari Double Danda
]
)
punct_list = sorted(list(punct_set))
return punct_list
Loading
Loading