diff --git a/Jenkinsfile b/Jenkinsfile index fd7a2c988..81a259a32 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,7 +10,6 @@ pipeline { disableConcurrentBuilds(abortPrevious: true) } environment { - AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0' EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0' @@ -27,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-25-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-16-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/address/__init__.py b/nemo_text_processing/text_normalization/hi/data/address/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/address/cities.tsv b/nemo_text_processing/text_normalization/hi/data/address/cities.tsv new file mode 100644 index 000000000..0199bf0cb --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/cities.tsv @@ -0,0 +1,36 @@ +अमरावती +ईटानगर +दिसपुर +पटना +रायपुर +पणजी +गांधीनगर +चंडीगढ़ +शिमला +रांची +बेंगलुरु +तिरुवनंतपुरम +भोपाल +मुंबई +इम्फाल +शिलांग +आइजोल +कोहिमा +भुवनेश्वर +जयपुर +गंगटोक +चेन्नई +हैदराबाद +अगरतला +लखनऊ +देहरादून +कोलकाता +पोर्ट ब्लेयर +दमन +नई दिल्ली +श्रीनगर +जम्मू +लेह +कारगिल +कवरत्ती +पुडुचेरी diff --git a/nemo_text_processing/text_normalization/hi/data/address/context.tsv b/nemo_text_processing/text_normalization/hi/data/address/context.tsv new file mode 100644 index 000000000..9faadaa3b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/context.tsv @@ -0,0 +1,48 @@ +हाउस +प्लॉट +बूथ +अपार्टमेंट +फ्लैट +यूनिट +टावर +कॉम्प्लेक्स +मंजिल +फ्लोर +ब्लॉक +सेक्टर +फेज +रोड +सड़क +मार्ग +स्ट्रीट +गली +राजमार्ग +ड्राइव +डिस्ट्रिक्ट +बाईपास +हाइवे +पार्कवे +कॉलोनी +नगर +पार्क +एस्टेट +बोलवार्ड +मार्केट +सेंटर +पिन +गांव +पास +ब्रिगेड +नियर +स्क्वेर +मॉल +टॉवर +इंस्टीट्यूट +पिलर +मेट्रो +एवेन्यू +वेस्ट +सामने +पीछे +वीया +आर डी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/address/en_to_hi_mapping.tsv b/nemo_text_processing/text_normalization/hi/data/address/en_to_hi_mapping.tsv new file mode 100644 index 000000000..15929b547 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/en_to_hi_mapping.tsv @@ -0,0 +1,2 @@ +street स्ट्रीट +southern सदर्न \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/address/letters.tsv b/nemo_text_processing/text_normalization/hi/data/address/letters.tsv new file mode 100644 index 000000000..68889ca3f --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/letters.tsv @@ -0,0 +1,26 @@ +A ए +B बी 
+C सी +D डी +E ई +F एफ +G जी +H एच +I आई +J जे +K के +L एल +M एम +N एन +O ओ +P पी +Q क्यू +R आर +S एस +T टी +U यू +V वी +W डब्ल्यू +X एक्स +Y वाई +Z ज़ेड \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/address/special_characters.tsv b/nemo_text_processing/text_normalization/hi/data/address/special_characters.tsv new file mode 100644 index 000000000..ca5b068bd --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/special_characters.tsv @@ -0,0 +1,2 @@ +- हाइफ़न +/ बटा \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/address/states.tsv b/nemo_text_processing/text_normalization/hi/data/address/states.tsv new file mode 100644 index 000000000..1e2b6c358 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/address/states.tsv @@ -0,0 +1,36 @@ +आंध्र प्रदेश +अरुणाचल प्रदेश +असम +बिहार +छत्तीसगढ़ +गोवा +गुजरात +हरियाणा +हिमाचल प्रदेश +झारखंड +कर्नाटक +केरल +मध्य प्रदेश +महाराष्ट्र +मणिपुर +मेघालय +मिज़ोरम +नागालैंड +ओडिशा +पंजाब +राजस्थान +सिक्किम +तमिलनाडु +तेलंगाना +त्रिपुरा +उत्तर प्रदेश +उत्तराखंड +पश्चिम बंगाल +अंडमान और निकोबार द्वीप समूह +चंडीगढ़ +दादरा और नगर हवेली और दमन और दीव +दिल्ली +जम्मू और कश्मीर +लद्दाख +लक्षद्वीप +पुडुचेरी diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv new file mode 100644 index 000000000..6fcfb8b3a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_list.tsv @@ -0,0 +1,5 @@ +हफ़्ते +सप्ताह +सदियां +सदियों + diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv similarity index 77% rename from nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv rename to 
nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv index eaddf930a..e190a80ef 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units_map.tsv @@ -4,9 +4,7 @@ h घंटे min मिनट doz दर्जन yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना months महीने -हफ़्ते हफ़्ते \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 189512687..4065bc86b 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -134,7 +134,6 @@ KHz किलोहर्ट्ज़ N न्यूटन dB डेसीबल yr साल -yr वर्ष hp हॉर्सपॉवर d दिन month महीना diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv new file mode 100644 index 000000000..a3c7b2162 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv @@ -0,0 +1 @@ +yr वर्ष diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/en_to_hi_digit.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/en_to_hi_digit.tsv new file mode 100644 index 000000000..a89e99b3c --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/en_to_hi_digit.tsv @@ -0,0 +1,10 @@ +0 ० +1 १ +2 २ +3 ३ +4 ४ +5 ५ +6 ६ +7 ७ +8 ८ +9 ९ diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv new file mode 100644 index 000000000..26a5efc1b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/exceptions.tsv @@ -0,0 +1,25 @@ +१ला पहला +१ली पहली +२रा दूसरा +२री दूसरी +३रा तीसरा +३री तीसरी +४था चौथा +४थी चौथी +६ठा छठा +६ठी छठी +१st फ़र्स्ट +२nd सेकंड +३rd थर्ड +४th फ़ोर्थ +५th फ़िफ्थ +६th सिक्स्थ +७th सेवंथ +८th एटथ +९th नाइंथ +१०th टेंथ +११th इलेवंथ +१२th ट्वेल्फ्थ +१३th थर्टींथ +१४th फोर्टींथ +१५th फिफ्टींथ diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv new file mode 100644 index 000000000..922e9d6b8 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes.tsv @@ -0,0 +1,3 @@ +वां +वीं +वें diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv new file mode 100644 index 000000000..77139cff5 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv @@ -0,0 +1,2 @@ +वे वें + diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv new file mode 100644 index 000000000..46b485af6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/credit_context.tsv @@ -0,0 +1,3 @@ +नंबर +कार्ड +क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv new file mode 100644 index 000000000..17a123bee --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/landline_context.tsv @@ -0,0 +1,5 @@ +नंबर +मोबाइल +फोन +लैंडलाइन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv new file mode 100644 index 000000000..f2fa6e52f --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/mobile_context.tsv @@ -0,0 +1,4 @@ +नंबर +मोबाइल +फोन +कॉल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv new file mode 100644 index 000000000..e8c04b723 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/number.tsv @@ -0,0 +1,10 @@ +0 शून्य +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv new file mode 100644 index 000000000..322c7248e --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/telephone/pincode_context.tsv @@ -0,0 +1,4 @@ +नंबर +पिन +कोड +पिनकोड \ No newline at end of file diff --git 
a/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv new file mode 100644 index 000000000..3477871e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv @@ -0,0 +1,100 @@ +० एक +१ दो +२ तीन +३ चार +४ पाँच +५ छह +६ सात +७ आठ +८ नौ +९ दस +१० ग्यारह +११ बारह +१२ तेरह +१३ चौदह +१४ पंद्रह +१५ सोलह +१६ सत्रह +१७ अठारह +१८ उन्नीस +१९ बीस +२० इक्कीस +२१ बाईस +२२ तेईस +२३ चौबीस +२४ पच्चीस +२५ छब्बीस +२६ सत्ताईस +२७ अट्ठाईस +२८ उनतीस +२९ तीस +३० इकतीस +३१ बत्तीस +३२ तैंतीस +३३ चौंतीस +३४ पैंतीस +३५ छत्तीस +३६ सैंतीस +३७ अड़तीस +३८ उनतालीस +३९ चालीस +४० इकतालीस +४१ बयालीस +४२ तैंतालीस +४३ चौवालीस +४४ पैंतालीस +४५ छियालीस +४६ सैंतालीस +४७ अड़तालीस +४८ उनचास +४९ पचास +५० इक्यावन +५१ बावन +५२ तिरेपन +५३ चौवन +५४ पचपन +५५ छप्पन +५६ सत्तावन +५७ अट्ठावन +५८ उनसठ +५९ साठ +६० इकसठ +६१ बासठ +६२ तिरेसठ +६३ चौंसठ +६४ पैंसठ +६५ छियासठ +६६ सड़सठ +६७ अड़सठ +६८ उनहत्तर +६९ सत्तर +७० इकहत्तर +७१ बहत्तर +७२ तिहत्तर +७३ चौहत्तर +७४ पचहत्तर +७५ छिहत्तर +७६ सतहत्तर +७७ अठहत्तर +७८ उनासी +७९ अस्सी +८० इक्यासी +८१ बयासी +८२ तिरासी +८३ चौरासी +८४ पचासी +८५ छियासी +८६ सत्तासी +८७ अट्ठासी +८८ नवासी +८९ नब्बे +९० इक्यानबे +९१ बानबे +९२ तिरानबे +९३ चौरानबे +९४ पंचानबे +९५ छियानबे +९६ सत्तानबे +९७ अट्ठानबे +९८ निन्यानबे +९९ एक सौ diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index 6a5d3c699..d498ae489 100644 --- a/nemo_text_processing/text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -30,6 +30,31 @@ NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_NON_ZERO = pynini.union("१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() NEMO_HI_ZERO = "०" + +HI_DEDH = "डेढ़" # 1.5 +HI_DHAI = "ढाई" # 2.5 +HI_SAVVA = "सवा" # quarter more (1.25) +HI_SADHE = "साढ़े" # half more (X.5) 
+HI_PAUNE = "पौने" # quarter less (0.75) + +# Hindi decimal representations +HI_POINT_FIVE = ".५" # .5 +HI_ONE_POINT_FIVE = "१.५" # 1.5 +HI_TWO_POINT_FIVE = "२.५" # 2.5 +HI_DECIMAL_25 = ".२५" # .25 +HI_DECIMAL_75 = ".७५" # .75 + +# Symbol constants +HI_BY = "बाई" +LOWERCASE_X = "x" +UPPERCASE_X = "X" +ASTERISK = "*" +HYPHEN = "-" +SLASH = "/" +COMMA = "," +PERIOD = "." +HI_PERIOD = "।" + NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index c50384acf..eb4feaef1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,18 +15,18 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_HI_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.hi.utils import get_abs_path class CardinalFst(GraphFst): """ - Finite state transducer for classifying cardinals, e.g. - -२३ -> cardinal { negative: "true" integer: "तेइस" } } - s - Args: - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + Finite state transducer for classifying cardinals, e.g. 
+ -२३ -> cardinal { negative: "true" integer: "तेइस" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True, lm: bool = False): @@ -37,6 +37,15 @@ def __init__(self, deterministic: bool = True, lm: bool = False): teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + self.digit = digit + self.zero = zero + self.teens_and_ties = teens_and_ties + + # Single digit graph for digit-by-digit reading + # e.g., "०७३" -> "शून्य सात तीन" + single_digit_graph = digit | zero + self.single_digits_graph = single_digit_graph + pynini.closure(insert_space + single_digit_graph) + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: @@ -294,7 +303,8 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) graph_ten_shankhs.optimize() - final_graph = ( + # Graph without leading zeros - used by other taggers like ordinal, decimal and measure + graph_without_leading_zeros = ( digit | zero | teens_and_ties @@ -316,6 +326,17 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): | graph_shankhs | graph_ten_shankhs ) + self.graph_without_leading_zeros = graph_without_leading_zeros.optimize() + + # Handle numbers with leading zeros by reading digit-by-digit + # e.g., "०७३" -> "शून्य सात तीन", "००५" -> "शून्य शून्य पाँच" + cardinal_with_leading_zeros = pynini.compose( + pynini.accep("०") + pynini.closure(NEMO_HI_DIGIT), self.single_digits_graph + ) + cardinal_with_leading_zeros = pynutil.add_weight(cardinal_with_leading_zeros, 0.5) + + # Full graph including leading zeros - for standalone cardinal matching + final_graph = 
graph_without_leading_zeros | cardinal_with_leading_zeros optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 37b192165..b25abcac6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -65,11 +65,11 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) - cardinal_graph = ( - digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands + cardinal_graph = pynini.union( + digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands ) - graph_year = graph_year_thousands | graph_year_hundreds_as_thousands + graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") @@ -102,13 +102,10 @@ def __init__(self, cardinal: GraphFst): # Updated logic to use prefix_union year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") - graph_dd_mm_yyyy = ( - days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph - ) + delete_separator = pynini.union(delete_dash, delete_slash) + graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph - graph_mm_dd_yyyy = ( - months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph - ) + graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py 
b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index 955e8c0d3..7522de2bb 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -58,10 +58,8 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - - cardinal_graph = cardinal.final_graph + graph_digit = cardinal.digit | cardinal.zero + cardinal_graph = cardinal.graph_without_leading_zeros self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py index 8971cd3dd..b5528deba 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py @@ -15,7 +15,20 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.hi.graph_utils import ( + HI_DEDH, + HI_DHAI, + HI_PAUNE, + HI_SADHE, + HI_SAVVA, + NEMO_SPACE, + GraphFst, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + +HI_ONE_HALF = "१/२" # 1/2 +HI_ONE_QUARTER = "१/४" # 1/4 +HI_THREE_QUARTERS = "३/४" # 3/4 class FractionFst(GraphFst): @@ -39,21 +52,76 @@ def __init__(self, cardinal, deterministic: bool = True): cardinal_graph = cardinal.final_graph self.optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + pynutil.insert(NEMO_SPACE), 0, 1 ) self.integer = pynutil.insert("integer_part: \"") + 
cardinal_graph + pynutil.insert("\"") self.numerator = ( - pynutil.insert("numerator: \"") + cardinal_graph + pynini.cross(pynini.union("/", " / "), "\" ") + pynutil.insert("numerator: \"") + + cardinal_graph + + pynini.cross(pynini.union("/", NEMO_SPACE + "/" + NEMO_SPACE), "\"") + + pynutil.insert(NEMO_SPACE) ) self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") - self.graph = ( + dedh_dhai_graph = pynini.string_map( + [("१" + NEMO_SPACE + HI_ONE_HALF, HI_DEDH), ("२" + NEMO_SPACE + HI_ONE_HALF, HI_DHAI)] + ) + + savva_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_QUARTER, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(NEMO_SPACE + HI_ONE_HALF, "") + sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(NEMO_SPACE + HI_THREE_QUARTERS, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_paune = ( + pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + final_graph = ( self.optional_graph_negative - + pynini.closure(self.integer + pynini.accep(" "), 0, 1) + + pynini.closure(self.integer + pynini.accep(NEMO_SPACE), 0, 1) + self.numerator + self.denominator ) + weighted_graph = ( + final_graph + | 
pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.2) + ) + + self.graph = weighted_graph + graph = self.graph - final_graph = self.add_tokens(graph) - self.fst = final_graph.optimize() + graph = self.add_tokens(graph) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 9f1ffbd39..04d509559 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -15,10 +15,40 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import ( + ASTERISK, + COMMA, + HI_BY, + HI_DECIMAL_25, + HI_DECIMAL_75, + HI_DEDH, + HI_DHAI, + HI_ONE_POINT_FIVE, + HI_PAUNE, + HI_PERIOD, + HI_POINT_FIVE, + HI_SADHE, + HI_SAVVA, + HI_TWO_POINT_FIVE, + HYPHEN, + INPUT_LOWER_CASED, + LOWERCASE_X, + NEMO_CHAR, + NEMO_DIGIT, + NEMO_HI_DIGIT, + NEMO_NOT_SPACE, + NEMO_SPACE, + NEMO_WHITE_SPACE, + PERIOD, + SLASH, + UPPERCASE_X, + GraphFst, + capitalized_input_graph, + delete_space, + insert_space, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path - digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) @@ -29,6 +59,7 @@ class MeasureFst(GraphFst): Finite state transducer for classifying measure, suppletive aware, e.g. 
-१२kg -> measure { negative: "true" cardinal { integer: "बारह" } units: "किलोग्राम" } -१२.२kg -> measure { decimal { negative: "true" integer_part: "बारह" fractional_part: "दो"} units: "किलोग्राम" } + मुंबई ८८४४०४ -> measure { units: "address" cardinal { integer: "मुंबई आठ आठ चार चार शून्य चार" } preserve_order: true } Args: cardinal: CardinalFst @@ -37,12 +68,145 @@ class MeasureFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def get_structured_address_graph(self, ordinal: GraphFst, input_case: str): + """ + Minimal address tagger for state/city + pincode patterns only. + Highly optimized for performance. + + Examples: + "मुंबई ८८४४०४" -> "मुंबई आठ आठ चार चार शून्य चार" + "गोवा १२३४५६" -> "गोवा एक दो तीन चार पाँच छह" + """ + # State/city keywords + states = pynini.string_file(get_abs_path("data/address/states.tsv")) + cities = pynini.string_file(get_abs_path("data/address/cities.tsv")) + state_city_names = pynini.union(states, cities).optimize() + + # Digit mappings + num_token = ( + digit + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + | pynini.string_file(get_abs_path("data/telephone/number.tsv")) + ).optimize() + + # Pincode (6 digits) + pincode = (num_token + pynini.closure(insert_space + num_token, 5, 5)).optimize() + + # Street number (1-4 digits) + street_num = (num_token + pynini.closure(insert_space + num_token, 0, 3)).optimize() + + # Text: words with trailing separator (comma? 
+ space) + any_digit = pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT).optimize() + punctuation = pynini.union(COMMA, PERIOD, HI_PERIOD).optimize() + word_char = pynini.difference(NEMO_NOT_SPACE, pynini.union(any_digit, punctuation)).optimize() + word = pynini.closure(word_char, 1) + + # Separator: optional comma followed by mandatory space + sep = pynini.closure(pynini.accep(COMMA), 0, 1) + pynini.accep(NEMO_SPACE) + word_with_sep = word + sep + text = pynini.closure(word_with_sep, 0, 5).optimize() + + # Pattern: [street_num + sep]? text state/city [space pincode] + pattern = ( + pynini.closure(street_num + sep, 0, 1) + + text + + state_city_names + + pynini.closure(pynini.accep(NEMO_SPACE) + pincode, 0, 1) + ).optimize() + + graph = ( + pynutil.insert('units: "address" cardinal { integer: "') + + pattern + + pynutil.insert('" } preserve_order: true') + ) + return pynutil.add_weight(graph, 1.0).optimize() + + def get_address_graph(self, ordinal: GraphFst, input_case: str): + """ + Address tagger that converts digits/hyphens/slashes character-by-character + when address context keywords are present. + English words and ordinals are converted to Hindi transliterations. + + Examples: + "७०० ओक स्ट्रीट" -> "सात शून्य शून्य ओक स्ट्रीट" + "६६-४ पार्क रोड" -> "छह छह हाइफ़न चार पार्क रोड" + """ + ordinal_graph = ordinal.graph + # Alphanumeric to word mappings (digits, special characters, telephone digits) + char_to_word = ( + digit + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + | pynini.string_file(get_abs_path("data/address/special_characters.tsv")) + | pynini.string_file(get_abs_path("data/telephone/number.tsv")) + ).optimize() + # Letter to transliterated word mapping (A -> ए, B -> बी, ...) 
+ letter_to_word = pynini.string_file(get_abs_path("data/address/letters.tsv")) + address_keywords_hi = pynini.string_file(get_abs_path("data/address/context.tsv")) + + # English address keywords with Hindi translation (case-insensitive) + en_to_hi_map = pynini.string_file(get_abs_path("data/address/en_to_hi_mapping.tsv")) + if input_case != INPUT_LOWER_CASED: + en_to_hi_map = capitalized_input_graph(en_to_hi_map) + address_keywords_en = pynini.project(en_to_hi_map, "input") + address_keywords = pynini.union(address_keywords_hi, address_keywords_en) + + # Alphanumeric processing: treat digits, letters, and -/ as convertible tokens + single_digit = pynini.union(NEMO_DIGIT, NEMO_HI_DIGIT).optimize() + special_chars = pynini.union(HYPHEN, SLASH).optimize() + single_letter = pynini.project(letter_to_word, "input").optimize() + convertible_char = pynini.union(single_digit, special_chars, single_letter) + non_space_char = pynini.difference( + NEMO_CHAR, pynini.union(NEMO_WHITE_SPACE, convertible_char, pynini.accep(COMMA)) + ).optimize() + + # Token processors with weights: prefer ordinals and known English→Hindi words + # Delete space before comma to avoid Sparrowhawk "sil" issue + comma_processor = pynutil.add_weight(delete_space + pynini.accep(COMMA), 0.0) + ordinal_processor = pynutil.add_weight(insert_space + ordinal_graph, -5.0) + english_word_processor = pynutil.add_weight(insert_space + en_to_hi_map, -3.0) + letter_processor = pynutil.add_weight(insert_space + pynini.compose(single_letter, letter_to_word), 0.5) + digit_char_processor = pynutil.add_weight(insert_space + pynini.compose(convertible_char, char_to_word), 0.0) + other_word_processor = pynutil.add_weight(insert_space + pynini.closure(non_space_char, 1), 0.1) + + token_processor = ( + ordinal_processor + | english_word_processor + | letter_processor + | digit_char_processor + | pynini.accep(NEMO_SPACE) + | comma_processor + | other_word_processor + ).optimize() + full_string_processor = 
pynini.closure(token_processor, 1).optimize() + + # Window-based context matching around address keywords for robust detection + word_boundary = pynini.union( + NEMO_WHITE_SPACE, pynini.accep(COMMA), pynini.accep(HI_PERIOD), pynini.accep(PERIOD) + ).optimize() + non_boundary_char = pynini.difference(NEMO_CHAR, word_boundary) + word = pynini.closure(non_boundary_char, 1).optimize() + word_with_boundary = word + pynini.closure(word_boundary) + window = pynini.closure(word_with_boundary, 0, 5).optimize() + boundary = pynini.closure(word_boundary, 1).optimize() + input_pattern = pynini.union( + address_keywords + boundary + window, + window + boundary + address_keywords + pynini.closure(boundary + window, 0, 1), + ).optimize() + address_graph = pynini.compose(input_pattern, full_string_processor).optimize() + graph = ( + pynutil.insert('units: "address" cardinal { integer: "') + + address_graph + + pynutil.insert('" } preserve_order: true') + ) + return pynutil.add_weight(graph, 1.05).optimize() + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, input_case: str): super().__init__(name="measure", kind="classify") cardinal_graph = ( - digit - | teens_and_ties + cardinal.zero + | cardinal.digit + | cardinal.teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands @@ -53,7 +217,19 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) - quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) + + # Year unit variants for formal/informal handling + year_informal = pynini.string_map([("yr", "साल")]) + year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv")) + + # All units EXCEPT year + 
unit_inputs_except_yr = pynini.difference(pynini.project(unit_graph, "input"), pynini.accep("yr")) + unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph) + + # Load quarterly units from separate files: map (FST) and list (FSA) + quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv")) + quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv")) + quarterly_units_graph = pynini.union(quarterly_units_map, quarterly_units_list) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, @@ -64,23 +240,57 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Define the quarterly measurements quarter = pynini.string_map( [ - (".५", "साढ़े"), - ("१.५", "डेढ़"), - ("२.५", "ढाई"), + (HI_POINT_FIVE, HI_SADHE), + (HI_ONE_POINT_FIVE, HI_DEDH), + (HI_TWO_POINT_FIVE, HI_DHAI), ] ) quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling - unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") - units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") + unit = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + unit_graph_no_year + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + units = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + quarterly_units_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + # Year-specific unit wrappers + unit_year_informal = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_informal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + unit_year_formal = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_formal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + # Cardinal >= 1000 -> formal year (वर्ष) + # Use graph_without_leading_zeros which covers all number ranges 
(thousands to shankhs) + cardinal_large = cardinal.graph_without_leading_zeros + + # Cardinal < 1000 -> informal year (साल) + cardinal_small = cardinal.zero | cardinal.digit | cardinal.teens_and_ties | cardinal.graph_hundreds - # Handling symbols like x, X, * symbol_graph = pynini.string_map( [ - ("x", "बाई"), - ("X", "बाई"), - ("*", "बाई"), + (LOWERCASE_X, HI_BY), + (UPPERCASE_X, HI_BY), + (ASTERISK, HI_BY), ] ) @@ -93,10 +303,71 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - graph_quarter = ( + dedh_dhai = pynini.string_map([(HI_ONE_POINT_FIVE, HI_DEDH), (HI_TWO_POINT_FIVE, HI_DHAI)]) + dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"") + + savva_numbers = cardinal_graph + pynini.cross(HI_DECIMAL_25, "") + savva_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SAVVA) + + pynutil.insert(NEMO_SPACE) + + savva_numbers + + pynutil.insert("\"") + ) + + sadhe_numbers = cardinal_graph + pynini.cross(HI_POINT_FIVE, "") + sadhe_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_SADHE) + + pynutil.insert(NEMO_SPACE) + + sadhe_numbers + + pynutil.insert("\"") + ) + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(HI_DECIMAL_75, "") + paune_graph = ( + pynutil.insert("integer: \"") + + pynutil.insert(HI_PAUNE) + + pynutil.insert(NEMO_SPACE) + + paune_numbers + + pynutil.insert("\"") + ) + + graph_dedh_dhai = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + dedh_dhai_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + units + ) + + graph_savva = ( pynutil.insert("cardinal { ") + optional_graph_negative - + quarter_graph + + savva_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + units + ) + + graph_sadhe = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + sadhe_graph + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + 
delete_space + + units + ) + + graph_paune = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + paune_graph + pynutil.insert(" }") + delete_space + units @@ -108,11 +379,48 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("integer: \"") + cardinal_graph + pynutil.insert("\"") - + pynutil.insert(" }") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + delete_space + unit ) + # Large numbers (>=1000) + yr -> formal (वर्ष) + graph_cardinal_year_formal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_large + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_formal + ) + + # Small numbers (<1000) + yr -> informal (साल) + graph_cardinal_year_informal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_small + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_informal + ) + + # Regular decimals (e.g., 16.07) + yr -> formal (वर्ष) + graph_decimal_year_formal = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal_graph + + pynutil.insert(" }") + + delete_space + + unit_year_formal + ) + # Handling cardinal clubbed with symbol as single token graph_exceptions = ( pynutil.insert("cardinal { ") @@ -121,9 +429,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + cardinal_graph + pynutil.insert("\"") + pynutil.insert(" }") - + pynutil.insert(" units: \"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + symbol_graph - + pynutil.insert("\" ") + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + pynutil.insert("} }") + insert_space + pynutil.insert("tokens { cardinal { ") @@ -133,11 +443,22 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") ) + address_graph = self.get_address_graph(ordinal, input_case) + 
structured_address_graph = self.get_structured_address_graph(ordinal, input_case) + graph = ( - pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_quarter, 0.005) - | pynutil.add_weight(graph_cardinal, 0.01) - | pynutil.add_weight(graph_exceptions, 0.01) + pynutil.add_weight(graph_decimal, 0.1) + | pynutil.add_weight(graph_decimal_year_formal, 0.1) + | pynutil.add_weight(graph_cardinal, 0.1) + | pynutil.add_weight(graph_cardinal_year_formal, 0.1) + | pynutil.add_weight(graph_cardinal_year_informal, -0.1) # Higher priority for small numbers + | pynutil.add_weight(graph_exceptions, 0.1) + | pynutil.add_weight(graph_dedh_dhai, -0.2) + | pynutil.add_weight(graph_savva, -0.1) + | pynutil.add_weight(graph_sadhe, -0.1) + | pynutil.add_weight(graph_paune, -0.5) + | address_graph + | structured_address_graph ) self.graph = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/ordinal.py b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py new file mode 100644 index 000000000..b07c31392 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/ordinal.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class OrdinalFst(GraphFst):
    """
    Tagger (classify-stage transducer) for Hindi ordinal numbers, e.g.
        १०वां -> ordinal { integer: "दसवां" }
        २१वीं -> ordinal { integer: "इक्कीसवीं" }

    Args:
        cardinal: cardinal tagger whose sub-graphs (digit, zero, teens_and_ties,
            graph_hundreds, graph_thousands, graph_ten_thousands) supply the
            number-to-words part of the ordinal
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)

    Attributes set here:
        graph: number + suffix (or exception) transducer without any tagging
        fst: the same transducer wrapped as ``ordinal { integer: "..." }``
    """

    def __init__(self, cardinal: CardinalFst, deterministic: bool = True):
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        def load(relative_path):
            # Every data table of this grammar is a TSV resolved against the package directory.
            return pynini.string_file(get_abs_path(relative_path))

        # An ordinal suffix may come from either table.
        ordinal_suffix = pynini.union(load("data/ordinal/suffixes.tsv"), load("data/ordinal/suffixes_map.tsv"))
        irregular_forms = load("data/ordinal/exceptions.tsv")

        # Rewrites digits anywhere in the input before the number grammar sees them
        # (presumably ASCII digits to Devanagari digits, judging by the table name -- verify against the TSV).
        to_hindi_digits = pynini.cdrewrite(load("data/ordinal/en_to_hi_digit.tsv"), "", "", pynini.closure(NEMO_CHAR))

        # Only the smaller cardinal sub-graphs are used, to keep compilation fast.
        number_words = pynini.union(
            cardinal.digit,
            cardinal.zero,
            cardinal.teens_and_ties,
            cardinal.graph_hundreds,
            cardinal.graph_thousands,
            cardinal.graph_ten_thousands,
        ).optimize()

        # The negative weight makes a listed exception win over the regular number + suffix reading.
        ordinal_words = pynini.union(pynutil.add_weight(irregular_forms, -0.1), number_words + ordinal_suffix)

        self.graph = pynini.compose(to_hindi_digits, ordinal_words).optimize()

        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
# Accepted spellings of zero (ASCII and Devanagari) and of the digits a mobile / landline
# subscriber number may start with.
HI_ZERO_DIGIT = pynini.union("0", "०")
HI_MOBILE_START_DIGITS = pynini.union("६", "७", "८", "९", "6", "7", "8", "9").optimize()
HI_LANDLINE_START_DIGITS = pynini.union("२", "३", "४", "६", "2", "3", "4", "6").optimize()

# A written leading zero is dropped if present; the spoken zero is then always inserted.
delete_zero = pynutil.delete(HI_ZERO_DIGIT)
delete_zero_optional = pynini.closure(delete_zero, 0, 1)
insert_shunya = pynutil.insert('शून्य') + insert_space

# Digit-to-word tables and the context keyword lists, all loaded from TSV files.
digit_to_word = pynini.string_file(get_abs_path("data/telephone/number.tsv"))
digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
mobile_context = pynini.string_file(get_abs_path("data/telephone/mobile_context.tsv"))
landline_context = pynini.string_file(get_abs_path("data/telephone/landline_context.tsv"))
credit_context = pynini.string_file(get_abs_path("data/telephone/credit_context.tsv"))
pincode_context = pynini.string_file(get_abs_path("data/telephone/pincode_context.tsv"))

# One digit of any supported spelling, mapped to its spoken form.
num_token = pynini.union(digit_to_word, digits, zero).optimize()


def get_context(keywords: pynini.Fst):
    """
    Build the optional context matchers placed around a digit sequence.

    A context is a keyword separated from the number by at most five words, where a
    word is a run of characters that are neither digits nor whitespace, followed by a space.

    Args:
        keywords: acceptor/transducer of context keywords

    Returns:
        (before, after): ``before`` matches ``keyword + space + up to five words``;
        ``after`` deletes one space and then matches ``up to five words + keyword``.
        Both are optional (they also accept the empty string).
    """
    word_char = pynini.difference(NEMO_CHAR, pynini.union(NEMO_HI_DIGIT, NEMO_DIGIT, NEMO_WHITE_SPACE))
    spaced_word = pynini.closure(word_char, 1) + pynini.accep(NEMO_SPACE)
    up_to_five_words = pynini.closure(spaced_word, 0, 5)

    leading = pynini.closure(keywords + pynini.accep(NEMO_SPACE) + up_to_five_words, 0, 1)
    trailing = pynini.closure(pynutil.delete(NEMO_SPACE) + up_to_five_words + keywords, 0, 1)
    return leading.optimize(), trailing.optimize()


def generate_mobile(context_keywords: pynini.Fst) -> pynini.Fst:
    """
    Build the mobile-number tagger.

    Two shapes are accepted, each optionally followed by an ``extension`` field of one to
    three digits:
      * with country code: ``country_code: "[context] प्लस d{1,3}"`` then ``number_part``
      * without country code: ``number_part`` with optional leading context, an optional
        written zero removed, and a spoken zero inserted

    The subscriber number is one allowed start digit followed by nine or more digits:
    the closure only sets a lower bound.

    Args:
        context_keywords: keywords that may appear near a mobile number

    Returns:
        optimized transducer producing the telephone fields
    """
    leading_context, trailing_context = get_context(context_keywords)

    spoken_digit = num_token + insert_space
    open_number = pynutil.insert("number_part: \"")
    close_field = pynutil.insert("\" ")

    # Restrict the first digit to the allowed set, in either digit table.
    first_digit = pynini.union(HI_MOBILE_START_DIGITS @ digits, HI_MOBILE_START_DIGITS @ digit_to_word)
    subscriber = first_digit + insert_space + pynini.closure(spoken_digit, 9)
    number_tail = subscriber + trailing_context + close_field + delete_space

    country_code = (
        pynutil.insert("country_code: \"")
        + leading_context
        + pynini.cross("+", "प्लस")
        + insert_space
        + pynini.closure(spoken_digit, 1, 3)
        + close_field
        + pynini.closure(delete_space, 0, 1)
    )
    international = country_code + open_number + number_tail
    domestic = open_number + leading_context + delete_zero_optional + insert_shunya + number_tail

    extension = (
        pynutil.insert("extension: \"")
        + pynini.closure(spoken_digit, 1, 3)
        + trailing_context
        + close_field
        + delete_space
    )

    return (pynini.union(international, domestic) + pynini.closure(extension, 0, 1)).optimize()


def get_landline(std_length: int, context_keywords: pynini.Fst) -> pynini.Fst:
    """
    Build the landline tagger for one STD-code length.

    The STD code is exactly ``std_length`` digits (optional written zero removed, spoken
    zero inserted) and may be wrapped in parentheses. It is followed by an optional ``-``
    or ``.`` separator and a subscriber number of one allowed start digit plus
    ``9 - std_length`` further digits.

    Args:
        std_length: number of digits in the STD code
        context_keywords: keywords that may appear near a landline number

    Returns:
        optimized transducer producing a ``number_part`` field
    """
    leading_context, trailing_context = get_context(context_keywords)

    spoken_digit = num_token + insert_space
    optional_space = pynini.closure(delete_space, 0, 1)

    std_code = delete_zero_optional + insert_shunya + pynini.closure(spoken_digit, std_length, std_length)
    bracketed_std_code = (
        delete_zero_optional
        + delete_space
        + pynutil.delete("(")
        + optional_space
        + std_code
        + optional_space
        + pynutil.delete(")")
    )

    # Restrict the first subscriber digit to the allowed set, in either digit table.
    first_digit = pynini.union(HI_LANDLINE_START_DIGITS @ digits, HI_LANDLINE_START_DIGITS @ digit_to_word)
    remaining_digits = 9 - std_length
    subscriber = first_digit + insert_space + pynini.closure(spoken_digit, remaining_digits, remaining_digits)

    optional_separator = pynini.closure(pynini.union(pynini.cross("-", ""), pynini.cross(".", "")), 0, 1)

    return (
        pynutil.insert("number_part: \"")
        + leading_context
        + pynini.union(std_code, bracketed_std_code)
        + optional_separator
        + delete_space
        + subscriber
        + trailing_context
        + pynutil.insert("\" ")
    ).optimize()


def generate_landline(context_keywords: pynini.Fst) -> pynini.Fst:
    """
    Union of the landline taggers for STD-code lengths 2 through 7.

    Args:
        context_keywords: keywords that may appear near a landline number

    Returns:
        optimized transducer covering every supported STD-code length
    """
    per_length = [get_landline(std_length, context_keywords) for std_length in range(2, 8)]
    return pynini.union(*per_length).optimize()


def _context_digit_run(context_keywords: pynini.Fst, min_digits: int) -> pynini.Fst:
    """Tag ``min_digits`` or more digits (lower bound only), with optional context on either side, as ``number_part``."""
    leading_context, trailing_context = get_context(context_keywords)
    return (
        pynutil.insert("number_part: \"")
        + leading_context
        + pynini.closure(num_token + insert_space, min_digits)
        + trailing_context
        + pynutil.insert("\" ")
        + delete_space
    ).optimize()


def generate_credit(context_keywords: pynini.Fst) -> pynini.Fst:
    """
    Build the credit-card tagger: a run of four or more digits read digit by digit.

    Args:
        context_keywords: keywords that may appear near a card number

    Returns:
        optimized transducer producing a ``number_part`` field
    """
    return _context_digit_run(context_keywords, 4)


def generate_pincode(context_keywords: pynini.Fst) -> pynini.Fst:
    """
    Build the PIN-code tagger: a run of six or more digits read digit by digit.

    Args:
        context_keywords: keywords that may appear near a PIN code

    Returns:
        optimized transducer producing a ``number_part`` field
    """
    return _context_digit_run(context_keywords, 6)


class TelephoneFst(GraphFst):
    """
    Finite state transducer for tagging telephone numbers, e.g.
        ९१५७११४००७ -> telephone { number_part: "शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात" }
        +९१ ९२१०५१५६०६ -> telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" }
        १३७४-३०९९८८ -> telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" }

    The constructor takes no arguments. It combines the mobile, landline, credit-card and
    PIN-code graphs, with lower weights (higher priority) for the more specific patterns.

    Attributes set here:
        final: the weighted union of the four graphs, without the token wrapper
        fst: ``final`` wrapped by ``add_tokens``
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        # (graph, weight) pairs in priority order: the smaller the weight, the more preferred.
        weighted_branches = (
            (generate_mobile(mobile_context), 0.7),
            (generate_landline(landline_context), 0.8),
            (generate_credit(credit_context), 0.9),
            (generate_pincode(pincode_context), 1),
        )

        self.final = pynini.union(
            *(pynutil.add_weight(branch, weight) for branch, weight in weighted_branches)
        ).optimize()
        self.fst = self.add_tokens(self.final)
seconds_graph = pynini.string_file(get_abs_path("data/time/seconds.tsv")) @@ -36,10 +51,11 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="classify") delete_colon = pynutil.delete(":") + cardinal_graph = cardinal.digit | cardinal.teens_and_ties self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ") self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ") @@ -54,9 +70,57 @@ def __init__(self): graph_hm = self.hours + delete_colon + insert_space + self.minutes # hour - graph_h = self.hours + delete_colon + pynutil.delete("००") + graph_h = self.hours + delete_colon + pynutil.delete(HI_DOUBLE_ZERO) + + dedh_dhai_graph = pynini.string_map([("१" + HI_TIME_THIRTY, HI_DEDH), ("२" + HI_TIME_THIRTY, HI_DHAI)]) + + savva_numbers = cardinal_graph + pynini.cross(HI_TIME_FIFTEEN, "") + savva_graph = pynutil.insert(HI_SAVVA) + pynutil.insert(NEMO_SPACE) + savva_numbers + + sadhe_numbers = cardinal_graph + pynini.cross(HI_TIME_THIRTY, "") + sadhe_graph = pynutil.insert(HI_SADHE) + pynutil.insert(NEMO_SPACE) + sadhe_numbers + + paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv")) + paune_numbers = paune + pynini.cross(HI_TIME_FORTYFIVE, "") + paune_graph = pynutil.insert(HI_PAUNE) + pynutil.insert(NEMO_SPACE) + paune_numbers + + graph_dedh_dhai = ( + pynutil.insert("morphosyntactic_features: \"") + + dedh_dhai_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) - final_graph = graph_hms | graph_hm | graph_h + graph_savva = ( + pynutil.insert("morphosyntactic_features: \"") + + savva_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_sadhe = ( + pynutil.insert("morphosyntactic_features: \"") + + sadhe_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + graph_paune = ( + 
pynutil.insert("morphosyntactic_features: \"") + + paune_graph + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + final_graph = ( + graph_hms + | pynutil.add_weight(graph_hm, 0.3) + | pynutil.add_weight(graph_h, 0.3) + | pynutil.add_weight(graph_dedh_dhai, 0.1) + | pynutil.add_weight(graph_savva, 0.2) + | pynutil.add_weight(graph_sadhe, 0.2) + | pynutil.add_weight(graph_paune, 0.1) + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index b1bbd2a10..cb03ebce6 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -14,12 +14,12 @@ import logging import os -import time import pynini from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, delete_extra_space, @@ -32,7 +32,9 @@ from nemo_text_processing.text_normalization.hi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.taggers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.hi.taggers.word import WordFst @@ -77,51 +79,39 @@ def __init__( else: logging.info(f"Creating ClassifyFst grammars.") - start_time = time.time() cardinal = 
CardinalFst(deterministic=deterministic) cardinal_graph = cardinal.fst - logging.debug(f"cardinal: {time.time() - start_time: .2f}s -- {cardinal_graph.num_states()} nodes") - start_time = time.time() decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) decimal_graph = decimal.fst - logging.debug(f"decimal: {time.time() - start_time: .2f}s -- {decimal_graph.num_states()} nodes") - start_time = time.time() fraction = FractionFst(cardinal=cardinal, deterministic=deterministic) fraction_graph = fraction.fst - logging.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") - start_time = time.time() date = DateFst(cardinal=cardinal) date_graph = date.fst - logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") - start_time = time.time() - timefst = TimeFst() + timefst = TimeFst(cardinal=cardinal) time_graph = timefst.fst - logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") - start_time = time.time() - measure = MeasureFst(cardinal=cardinal, decimal=decimal) + ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic) + ordinal_graph = ordinal.fst + + measure = MeasureFst(cardinal=cardinal, decimal=decimal, ordinal=ordinal, input_case=input_case) measure_graph = measure.fst - logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") - start_time = time.time() money = MoneyFst(cardinal=cardinal) money_graph = money.fst - logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") - start_time = time.time() whitelist_graph = WhiteListFst( input_case=input_case, deterministic=deterministic, input_file=whitelist ).fst - logging.debug(f"whitelist: {time.time() - start_time: .2f}s -- {whitelist_graph.num_states()} nodes") - start_time = time.time() punctuation = PunctuationFst(deterministic=deterministic) punct_graph = punctuation.fst - logging.debug(f"punct: 
{time.time() - start_time: .2f}s -- {punct_graph.num_states()} nodes") + + telephone = TelephoneFst() + telephone_graph = telephone.fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -132,35 +122,39 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) ) - start_time = time.time() word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst - logging.debug(f"word: {time.time() - start_time: .2f}s -- {word_graph.num_states()} nodes") punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct), + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct), + ), 1, ) - classify |= pynutil.add_weight(word_graph, 100) + classify = pynini.union(classify, pynutil.add_weight(word_graph, 100)) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( - pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + pynini.closure(punct + pynutil.insert(NEMO_SPACE)) + + token + + pynini.closure(pynutil.insert(NEMO_SPACE) + punct) ) graph = token_plus_punct + pynini.closure( - ( - pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) - | (pynutil.insert(" ") + punct + pynutil.insert(" ")) + pynini.union( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space), + (pynutil.insert(NEMO_SPACE) + punct + pynutil.insert(NEMO_SPACE)), ) + token_plus_punct ) graph = delete_space + graph + delete_space - graph |= punct + graph = pynini.union(graph, punct) self.fst = graph.optimize() diff --git 
a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index bc354232b..00feb1827 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -40,10 +40,9 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): # Define Hindi characters and symbols using pynini.union HINDI_CHAR = pynini.union( - *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants - *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters - *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics - *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits + *[chr(i) for i in range(0x0900, 0x0903 + 1)], # Hindi vowels and consonants + *[chr(i) for i in range(0x0905, 0x0939 + 1)], # More Hindi characters + *[chr(i) for i in range(0x093E, 0x094D + 1)], # Hindi diacritics ).optimize() # Include punctuation in the graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index 7e3b33b7c..a07c41eae 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -40,6 +40,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") insert_aur = pynutil.insert(" और ") + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) fraction_default = numerator + insert_bata + denominator @@ -47,7 +50,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + 
fraction_default - ) + ) | graph_quarter graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py index d6d17ac37..cba08057d 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py @@ -27,7 +27,7 @@ class MeasureFst(GraphFst): Args: decimal: DecimalFst - cardinal: CardinalFs + cardinal: CardinalFst deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ @@ -41,7 +41,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): 1, ) - unit = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space + unit = ( + pynutil.delete("units: \"") + + pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.accep("address")) + + pynutil.delete("\"") + + delete_space + ) graph_decimal = ( pynutil.delete("decimal {") @@ -64,6 +69,18 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph = (graph_cardinal | graph_decimal) + delete_space + insert_space + unit + + preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space + address = ( + pynutil.delete("units: \"address\" ") + + delete_space + + graph_cardinal + + delete_space + + pynini.closure(preserve_order) + ) + + graph |= address + self.decimal = graph_decimal delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py new file mode 100644 index 000000000..ab88603f6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/ordinal.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
class OrdinalFst(GraphFst):
    """
    Verbalizer for Hindi ordinals: keeps the quoted value of the ``integer`` field
    and removes the surrounding markup, e.g.
        ordinal { integer: "दसवां" } -> दसवां
        ordinal { integer: "इक्कीसवीं" } -> इक्कीसवीं

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        # Field name, any space after it, and the opening quote are all deleted.
        field_opening = pynutil.delete("integer:") + delete_space + pynutil.delete("\"")
        # The spoken form is passed through unchanged; it must be non-empty.
        spoken_form = pynini.closure(NEMO_NOT_QUOTE, 1)
        field_closing = pynutil.delete("\"")

        # delete_tokens is inherited; presumably it strips the `ordinal { ... }` wrapper
        # shown in the examples above -- confirm in GraphFst.
        self.fst = self.delete_tokens(field_opening + spoken_form + field_closing).optimize()
nemo_text_processing.text_normalization.hi.graph_utils import ( + MIN_NEG_WEIGHT, + NEMO_CHAR, NEMO_SIGMA, - delete_space, generator_main, ) +from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.utils.logging import logger @@ -46,68 +48,49 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] logger.info(f'Post processing graph was restored from {far_file}.') else: - self.set_punct_dict() self.fst = self.get_punct_postprocess_graph() if far_file: generator_main(far_file, {"post_process_graph": self.fst}) - def set_punct_dict(self): - self.punct_marks = { - "'": [ - "'", - '´', - 'ʹ', - 'ʻ', - 'ʼ', - 'ʽ', - 'ʾ', - 'ˈ', - 'ˊ', - 'ˋ', - '˴', - 'ʹ', - '΄', - '՚', - '՝', - 'י', - '׳', - 'ߴ', - 'ߵ', - 'ᑊ', - 'ᛌ', - '᾽', - '᾿', - '`', - '´', - '῾', - '‘', - '’', - '‛', - '′', - '‵', - 'ꞌ', - ''', - '`', - '𖽑', - '𖽒', - ], - } - def get_punct_postprocess_graph(self): """ Returns graph to post process punctuation marks. - {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept. - By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks. + By default, spaces are removed before punctuation marks like comma, period, etc. 
""" - - remove_space_around_single_quote = pynini.cdrewrite( - delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA) + punct_marks_all = PunctuationFst().punct_marks + + # Punctuation marks that should NOT have space before them + # (most punctuation except quotes, dashes, and opening brackets) + quotes = ["'", "\"", "«"] + dashes = ["-", "—"] + brackets = ["<", "{", "(", r"\["] + allow_space_before_punct = quotes + dashes + brackets + + no_space_before_punct = [m for m in punct_marks_all if m not in allow_space_before_punct] + # Add Hindi-specific punctuation + no_space_before_punct.extend(["।", ",", ".", ";", ":", "!", "?"]) + # Remove duplicates + no_space_before_punct = list(set(no_space_before_punct)) + no_space_before_punct = pynini.union(*no_space_before_punct) + + delete_space = pynutil.delete(" ") + + # Delete space before no_space_before_punct marks + non_punct = pynini.difference(NEMO_CHAR, no_space_before_punct).optimize() + graph = ( + pynini.closure(non_punct) + + pynini.closure( + no_space_before_punct | pynutil.add_weight(delete_space + no_space_before_punct, MIN_NEG_WEIGHT) + ) + + pynini.closure(non_punct) ) - # this works if spaces in between (good) - # delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA + graph = pynini.closure(graph).optimize() - graph = remove_space_around_single_quote.optimize() + # Remove space after opening brackets + no_space_after_punct = pynini.union(*brackets) + no_space_after_punct = pynini.cdrewrite(delete_space, no_space_after_punct, NEMO_SIGMA, NEMO_SIGMA).optimize() + graph = pynini.compose(graph, no_space_after_punct).optimize() return graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..55ebeab01 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,72 @@ 
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + MIN_NEG_WEIGHT, + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_space, + insert_space, +) + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone numbers, e.g. + telephone { country_code: "प्लस नौ एक", number_part: "नौ दो एक शून्य पाँच एक पाँच छह शून्य छह" } -> प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह + telephone { number_part: "शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ" } -> शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="verbalize", deterministic=deterministic) + + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + + insert_space, + 0, + 1, + ) + + number_part = ( + pynutil.delete("number_part: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(pynutil.add_weight(pynutil.delete(NEMO_SPACE), MIN_NEG_WEIGHT), 0, 1) + + pynutil.delete("\"") + ) + + optional_extension = pynini.closure( + delete_space + + 
insert_space + + pynutil.delete("extension: \"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\""), + 0, + 1, + ) + + graph = optional_country_code + number_part + optional_extension + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/time.py b/nemo_text_processing/text_normalization/hi/verbalizers/time.py index da10df4a0..df232e3cd 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/time.py @@ -30,7 +30,7 @@ class TimeFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self): + def __init__(self, cardinal: GraphFst): super().__init__(name="time", kind="verbalize") hour = pynutil.delete("hours: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + insert_space @@ -63,13 +63,17 @@ def __init__(self): + insert_second ) + graph_quarter = ( + pynutil.delete("morphosyntactic_features: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + ) + # hour minute graph_hm = hour + delete_space + insert_bajkar + insert_space + minute + delete_space + insert_minute # hour graph_h = hour + delete_space + insert_baje - self.graph = graph_hms | graph_hm | graph_h + self.graph = graph_hms | graph_hm | graph_h | graph_quarter final_graph = self.graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index e91f0d9f6..30d076c93 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -19,6 +19,8 @@ from nemo_text_processing.text_normalization.hi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from 
nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -49,15 +51,21 @@ def __init__(self, deterministic: bool = True): date = DateFst() date_graph = date.fst - time = TimeFst() + time = TimeFst(cardinal=cardinal) time_graph = time.fst + ordinal = OrdinalFst(deterministic=deterministic) + ordinal_graph = ordinal.fst + measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst money = MoneyFst() money_graph = money.fst + telephone = TelephoneFst() + telephone_graph = telephone.fst + whitelist_graph = WhiteListFst(deterministic=deterministic).fst graph = ( @@ -68,7 +76,9 @@ def __init__(self, deterministic: bool = True): | time_graph | measure_graph | money_graph + | ordinal_graph | whitelist_graph + | telephone_graph ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 82f8f43d2..73263f454 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -161,7 +161,11 @@ def __init__( from nemo_text_processing.text_normalization.ar.verbalizers.verbalize_final import VerbalizeFinalFst elif lang == 'hi': from nemo_text_processing.text_normalization.hi.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.hi.verbalizers.post_processing import PostProcessingFst from nemo_text_processing.text_normalization.hi.verbalizers.verbalize_final import VerbalizeFinalFst + + if post_process: + self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) elif lang 
== 'it': from nemo_text_processing.text_normalization.it.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.it.verbalizers.verbalize_final import VerbalizeFinalFst @@ -374,7 +378,7 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if self.lang == "en" and hasattr(self, 'post_processor'): + if self.lang in ["en", "hi"] and hasattr(self, 'post_processor') and self.post_processor is not None: output = self.post_process(output) if punct_post_process: diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_address.txt new file mode 100644 index 000000000..788a8efdc --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_address.txt @@ -0,0 +1,55 @@ +७०० ओक स्ट्रीट~सात शून्य शून्य ओक स्ट्रीट +११ जंगल रोड~एक एक जंगल रोड +३०१ पार्क एवेन्यू~तीन शून्य एक पार्क एवेन्यू +गली नंबर १७ जीएकगढ़~गली नंबर एक सात जीएकगढ़ +अदनान अपार्टमेंट फ्लैट नंबर ५५~अदनान अपार्टमेंट फ्लैट नंबर पाँच पाँच +प्लॉट नंबर ८ बालाजी मार्केट~प्लॉट नंबर आठ बालाजी मार्केट +शॉप नंबर १०९ ९ और १० डिवाइडिंग रोड सेक्टर १० फरीदाबाद~शॉप नंबर एक शून्य नौ नौ और एक शून्य डिवाइडिंग रोड सेक्टर एक शून्य फरीदाबाद +बूथ ७०, सेक्टर ८, चंडीगढ़~बूथ सात शून्य, सेक्टर आठ, चंडीगढ़ +२२२१ Southern Street~दो दो दो एक सदर्न स्ट्रीट +७०० ओक स्ट्रीट~सात शून्य शून्य ओक स्ट्रीट +६२५ स्कूल स्ट्रीट~छह दो पाँच स्कूल स्ट्रीट +१४७० एस वाशिंगटन स्ट्रीट~एक चार सात शून्य एस वाशिंगटन स्ट्रीट +५०६ स्टेट रोड~पाँच शून्य छह स्टेट रोड +६६-४ पार्कहर्स्ट आर डी~छह छह हाइफ़न चार पार्कहर्स्ट आर डी +५७९ ट्रॉय-शेंक्टाडी रोड~पाँच सात नौ ट्रॉय हाइफ़न शेंक्टाडी रोड +७८३० - ई वेटरन्स पार्कवे, कोलंबस, जी ए ३१९०९~सात आठ तीन शून्य हाइफ़न ई वेटरन्स पार्कवे, कोलंबस, जी ए तीन एक नौ शून्य नौ +६६-४, पार्कहर्स्ट रोड~छह छह हाइफ़न चार, पार्कहर्स्ट रोड +८४०/१, १०० फीट रोड, मेट्रो पिलर ५६-५७, इंदिरानगर, बैंगलोर~आठ चार शून्य बटा एक, एक शून्य शून्य फीट रोड, मेट्रो पिलर पाँच छह हाइफ़न 
पाँच सात, इंदिरानगर, बैंगलोर +१७-१८, राजलक्ष्मी नगर, ७th क्रॉस स्ट्रीट, १०० फीट बाईपास रोड, वेलाचेरी, चेन्नई~एक सात हाइफ़न एक आठ, राजलक्ष्मी नगर, सेवंथ क्रॉस स्ट्रीट, एक शून्य शून्य फीट बाईपास रोड, वेलाचेरी, चेन्नई +४/५ न्यू म्युनिसिपल मार्केट रोड नंबर ५ और ६ सेन्टाक्रूज़ वेस्ट~चार बटा पाँच न्यू म्युनिसिपल मार्केट रोड नंबर पाँच और छह सेन्टाक्रूज़ वेस्ट +१६/१७ ४th फ्लोर जवाहर नगर मटरू मंदिर रोड नंबर २~एक छह बटा एक सात फ़ोर्थ फ्लोर जवाहर नगर मटरू मंदिर रोड नंबर दो +५/३०४ सिक्का कॉम्प्लेक्स विकास मार्ग एक्सटेंशन~पाँच बटा तीन शून्य चार सिक्का कॉम्प्लेक्स विकास मार्ग एक्सटेंशन +२१/२ २nd फ्लोर १st मेन रोड गांधी नगर~दो एक बटा दो सेकंड फ्लोर फ़र्स्ट मेन रोड गांधी नगर +नंबर २२/१८ ३rd फ्लोर सराय बोउ अली शू मार्केट~नंबर दो दो बटा एक आठ थर्ड फ्लोर सराय बोउ अली शू मार्केट +१४/३, मथुरा रोड~एक चार बटा तीन, मथुरा रोड +यूनिट ३ १st फ्लोर नंबर ३७ सोलेमान खतर स्ट्रीट~यूनिट तीन फ़र्स्ट फ्लोर नंबर तीन सात सोलेमान खतर स्ट्रीट +१st फ्लोर नंबर ५२ नॉर्थ अबूज़र स्ट्रीट खान ए अंसारी स्ट्रीट शरीयती स्ट्रीट १६६१७~फ़र्स्ट फ्लोर नंबर पाँच दो नॉर्थ अबूज़र स्ट्रीट खान ए अंसारी स्ट्रीट शरीयती स्ट्रीट एक छह छह एक सात +२०६ जय कॉम कॉम्प्लेक्स १st पोखरन रोड~दो शून्य छह जय कॉम कॉम्प्लेक्स फ़र्स्ट पोखरन रोड +नंबर ३६ २nd फ्लोर सुपर ८ फेज १ एकबतन टाउन तेहरान १३९४७~नंबर तीन छह सेकंड फ्लोर सुपर आठ फेज एक एकबतन टाउन तेहरान एक तीन नौ चार सात +२nd फ्लोर नंबर ८०८ आजादी स्ट्रीट~सेकंड फ्लोर नंबर आठ शून्य आठ आजादी स्ट्रीट +२nd फ्लोर नंबर १५ बिफ़ोर कांदि स्ट्रीट नॉर्थ सोहरावर्दी स्ट्रीट १५६६९~सेकंड फ्लोर नंबर एक पाँच बिफ़ोर कांदि स्ट्रीट नॉर्थ सोहरावर्दी स्ट्रीट एक पाँच छह छह नौ +यूनिट ४ नंबर २५ २nd गोलहा स्ट्रीट काशनी स्ट्रीट नूर स्क्वेर~यूनिट चार नंबर दो पाँच सेकंड गोलहा स्ट्रीट काशनी स्ट्रीट नूर स्क्वेर +ईस्ट ३rd फ्लोर नंबर ७० नेक्स्ट दो तोहीद इंस्टीट्यूट परचम स्ट्रीट~ईस्ट थर्ड फ्लोर नंबर सात शून्य नेक्स्ट दो तोहीद इंस्टीट्यूट परचम स्ट्रीट +३rd फ्लोर नंबर ५ हमेदन एली अपोज़िट लाले पार्क नॉर्थ कारगर स्ट्रीट~थर्ड फ्लोर नंबर पाँच हमेदन एली अपोज़िट लाले पार्क नॉर्थ कारगर स्ट्रीट +४th फ्लोर नंबर ११२४ जमहोरी 
स्ट्रीट~फ़ोर्थ फ्लोर नंबर एक एक दो चार जमहोरी स्ट्रीट +५th फ्लोर नंबर ७/१ १३th एली शाहिद अराबली स्ट्रीट~फ़िफ्थ फ्लोर नंबर सात बटा एक थर्टींथ एली शाहिद अराबली स्ट्रीट +११, ८० फीट रोड, इंडियन ऑयल पेट्रोल पंप, कोरमंगला ६th ब्लॉक, बैंगलोर के सामने~एक एक, आठ शून्य फीट रोड, इंडियन ऑयल पेट्रोल पंप, कोरमंगला सिक्स्थ ब्लॉक, बैंगलोर के सामने +२१/११, जे ब्लॉक, ६th एवेन्यू मेन रोड, अन्ना नगर पूर्व, चेन्नई~दो एक बटा एक एक, जे ब्लॉक, सिक्स्थ एवेन्यू मेन रोड, अन्ना नगर पूर्व, चेन्नई +३२A नाज़ प्लाज़ा मेरिस रोड~तीन दो ए नाज़ प्लाज़ा मेरिस रोड +२१४ बी गोविंद पूरी स्ट्रीट नंबर २~दो एक चार बी गोविंद पूरी स्ट्रीट नंबर दो +४३६२ १६वीं एवेन्यू एसडब्ल्यू, देवदार रैपिड्स, आई ए ५२४०४~चार तीन छह दो सोलहवीं एवेन्यू एसडब्ल्यू, देवदार रैपिड्स, आई ए बावन हज़ार चार सौ चार +अमरावती ६५५९३०~अमरावती छह पाँच पाँच नौ तीन शून्य +शिमला, हिमाचल प्रदेश ५९३९८८~शिमला, हिमाचल प्रदेश पाँच नौ तीन नौ आठ आठ +२७०४४० डॉसन आर डी, अल्बानी, जीए ३१७०७~दो सात शून्य चार चार शून्य डॉसन आर डी, अल्बानी, जीए तीन एक सात शून्य सात +रांची, झारखंड ७३६५५७~रांची, झारखंड सात तीन छह पाँच पाँच सात +कोहिमा, नागालैंड ४४८३७७~कोहिमा, नागालैंड चार चार आठ तीन सात सात +मुंबई, महाराष्ट्र ८३९४८८~मुंबई, महाराष्ट्र आठ तीन नौ चार आठ आठ +अमरावती ४६८२५२~अमरावती चार छह आठ दो पाँच दो +गांधीनगर, गुजरात ८०८३७४~गांधीनगर, गुजरात आठ शून्य आठ तीन सात चार +मुंबई, महाराष्ट्र २९०९३७~मुंबई, महाराष्ट्र दो नौ शून्य नौ तीन सात +श्रीनगर, जम्मू और कश्मीर ९६४५२३~श्रीनगर, जम्मू और कश्मीर नौ छह चार पाँच दो तीन +रायपुर, छत्तीसगढ़ ११०६३५~रायपुर, छत्तीसगढ़ एक एक शून्य छह तीन पाँच +भोपाल, मध्य प्रदेश ७५१२२५~भोपाल, मध्य प्रदेश सात पाँच एक दो दो पाँच +अगरतला, त्रिपुरा ९१५३०५~अगरतला, त्रिपुरा नौ एक पाँच तीन शून्य पाँच +लखनऊ, उत्तर प्रदेश ८०२४८१~लखनऊ, उत्तर प्रदेश आठ शून्य दो चार आठ एक diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 6ba21de69..46f981a88 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ 
b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -143,3 +143,8 @@ ११०२२३४५५६७~ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ २ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल +०५~शून्य पाँच +०१~शून्य एक +०७३~शून्य सात तीन +०००१~शून्य शून्य शून्य एक +०००~शून्य शून्य शून्य \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 86a824f72..95186a60d 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -64,3 +64,10 @@ ५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा २x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब १३x१३ का घर~तेरह बाई तेरह का घर +१००० yr~एक हज़ार वर्ष +९९९९ yr~नौ हज़ार नौ सौ निन्यानबे वर्ष +१६.०७ yr~सोलह दशमलव शून्य सात वर्ष +५ yr~पाँच साल +१.५ yr~डेढ़ साल +२.५ yr~ढाई साल +३.५ yr~साढ़े तीन साल diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..95184a603 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,72 @@ +१ला~पहला +१ली~पहली +२रा~दूसरा +२री~दूसरी +३रा~तीसरा +३री~तीसरी +४था~चौथा +४थी~चौथी +५वां~पाँचवां +५वीं~पाँचवीं +६ठा~छठा +६ठी~छठी +७वां~सातवां +७वीं~सातवीं +८वां~आठवां +८वीं~आठवीं +९वां~नौवां +९वीं~नौवीं +११वां~ग्यारहवां +१२वीं~बारहवीं +१४वां~चौदहवां +१६वीं~सोलहवीं +१७वां~सत्रहवां +१८वीं~अठारहवीं +१९वां~उन्नीसवां +२०वां~बीसवां +२१वां~इक्कीसवां +२५वीं~पच्चीसवीं +२७वें~सत्ताईसवें +३०वीं~तीसवीं +३३वां~तैंतीसवां +४०वीं~चालीसवीं +४५वां~पैंतालीसवां +५०वां~पचासवां +५६वें~छप्पनवें +६०वां~साठवां +६७वीं~सड़सठवीं +७५वीं~पचहत्तरवीं +८०वें~अस्सीवें +८८वां~अट्ठासीवां +९१वीं~इक्यानबेवीं +९९वां~निन्यानबेवां 
+१००वां~एक सौवां +१०१वां~एक सौ एकवां +१११वीं~एक सौ ग्यारहवीं +१२५वें~एक सौ पच्चीसवें +१५३वीं~एक सौ तिरेपनवीं +२००वीं~दो सौवीं +२१९वीं~दो सौ उन्नीसवीं +२४०वां~दो सौ चालीसवां +३२९वां~तीन सौ उनतीसवां +३६५वां~तीन सौ पैंसठवां +४५५वां~चार सौ पचपनवां +५५५वीं~पाँच सौ पचपनवीं +६४०वीं~छह सौ चालीसवीं +८९०वां~आठ सौ नब्बेवां +१००१वीं~एक हज़ार एकवीं +१०९१वें~एक हज़ार इक्यानबेवें +१७८२वीं~सत्रह सौ बयासीवीं +१८९०वां~एक हज़ार आठ सौ नब्बेवां +१९८१वीं~उन्नीस सौ इक्यासीवीं +९८२६वीं~अट्ठानबे सौ छब्बीसवीं +1st~फ़र्स्ट +2nd~सेकंड +3rd~थर्ड +4th~फ़ोर्थ +5th~फ़िफ्थ +6th~सिक्स्थ +7th~सेवंथ +8th~एटथ +9th~नाइंथ +10th~टेंथ diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..7a1b2c662 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,25 @@ +मेरा पुराना नंबर था ९१५७११४००७~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो ०३८६२-३५१७९१~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो १३७४-३०९९८८~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे कॉल करो ०१६८९११-४५७३~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++९१ ७४४०४३१०८३ मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++९१ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक १२३४ दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड ११००२३ है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है +मेरा पुराना नंबर था 9157114007~मेरा पुराना नंबर था शून्य नौ एक पाँच सात एक एक चार शून्य शून्य सात +इसपे कॉल करो 03862-351791~इसपे कॉल करो शून्य तीन आठ छह दो तीन पाँच एक सात नौ एक +मेरे इस नंबर पे कॉल करो 1374 309988~मेरे इस नंबर पे कॉल करो शून्य एक तीन सात चार तीन शून्य नौ नौ आठ आठ +इसपे 
कॉल करो 0168911-4573~इसपे कॉल करो शून्य एक छह आठ नौ एक एक चार पाँच सात तीन ++91 7440431083 मेरे इस नंबर पे कॉल करो~प्लस नौ एक सात चार चार शून्य चार तीन एक शून्य आठ तीन मेरे इस नंबर पे कॉल करो ++91 9210515606 मेरे इस नंबर पे कॉल करो~प्लस नौ एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो +भुगतान के लिए कार्ड के आखिरी अंक 1234 दर्ज करें~भुगतान के लिए कार्ड के आखिरी अंक एक दो तीन चार दर्ज करें +मेरा पिन कोड 110023 है~मेरा पिन कोड एक एक शून्य शून्य दो तीन है ++1 9210515606 मेरे इस नंबर पे कॉल करो~प्लस एक नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++४९ ९२१०५१५६०६ मेरे इस नंबर पे कॉल करो~प्लस चार नौ नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++353 9210515606 मेरे इस नंबर पे कॉल करो~प्लस तीन पाँच तीन नौ दो एक शून्य पाँच एक पाँच छह शून्य छह मेरे इस नंबर पे कॉल करो ++91 9876543210 123~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन ++1 6234517890 123~प्लस एक छह दो तीन चार पाँच एक सात आठ नौ शून्य एक दो तीन ++९१ ९८७६५४३२१० १२३~प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य एक दो तीन +(02229) 411128~शून्य दो दो दो नौ चार एक एक एक दो आठ +०२२.२९४१११२८~शून्य दो दो दो नौ चार एक एक एक दो आठ +0 (80) 26411128~शून्य आठ शून्य दो छह चार एक एक एक दो आठ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_address.py b/tests/nemo_text_processing/hi/test_address.py new file mode 100644 index 000000000..41b905f11 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_address.py @@ -0,0 +1,33 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestAddress: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_address.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_ordinal.py b/tests/nemo_text_processing/hi/test_ordinal.py index b65252694..3e5f4bfbb 100644 --- a/tests/nemo_text_processing/hi/test_ordinal.py +++ b/tests/nemo_text_processing/hi/test_ordinal.py @@ -17,13 +17,24 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestOrdinal: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False + ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def 
test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..621383a8d 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -76,15 +76,15 @@ testTNMoney() { runtest $input } -#testTNOrdinal() { -# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt -# runtest $input -#} +testTNOrdinal() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_ordinal.txt + runtest $input +} -#testTNTelephone() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_telephone.txt -# runtest $input -#} +testTNTelephone() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_telephone.txt + runtest $input +} testTNTime() { input=$PROJECT_DIR/hi/data_text_normalization/test_cases_time.txt @@ -106,10 +106,10 @@ testTNWord() { runtest $input } -#testTNAddress() { -# input=$PROJECT_DIR/en/data_text_normalization/test_cases_address.txt -# runtest $input -#} +testTNAddress() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_address.txt + runtest $input +} #testTNMath() { # input=$PROJECT_DIR/en/data_text_normalization/test_cases_math.txt diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index 7e43f7e82..e7b9f1c3d 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -16,12 +16,16 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from 
nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file class TestTelephone: inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @@ -29,3 +33,10 @@ class TestTelephone: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected