From 4898c83e9ee689aee8cd0ffb1b6ad82988b507ac Mon Sep 17 00:00:00 2001 From: jaya-TN Date: Thu, 4 Jun 2026 13:57:45 +0530 Subject: [PATCH 1/6] feat(te): add cardinal ITN tagger, verbalizer and test cases Signed-off-by: jaya-TN --- .../inverse_text_normalization/te/__init__.py | 17 ++ .../te/data/numbers/digit.tsv | 9 + .../te/data/numbers/teens_and_ties.tsv | 19 ++ .../te/data/numbers/zero.tsv | 1 + .../te/graph_utils.py | 204 ++++++++++++++++++ .../te/taggers/__init__.py | 13 ++ .../te/taggers/cardinal.py | 30 +++ .../inverse_text_normalization/te/utils.py | 63 ++++++ .../te/verbalizers/__init__.py | 13 ++ .../te/verbalizers/cardinal.py | 30 +++ run_cardinal_tests.py | 94 ++++++++ test_cases_cardinal.txt | 29 +++ 12 files changed, 522 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/te/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/te/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py create mode 100644 run_cardinal_tests.py create mode 100644 test_cases_cardinal.txt diff --git a/nemo_text_processing/inverse_text_normalization/te/__init__.py b/nemo_text_processing/inverse_text_normalization/te/__init__.py new file mode 100644 index 000000000..b650cbece --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/te/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.hi.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv new file mode 100644 index 000000000..6645fcf9e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +1 ఒకటి +2 రెండు +3 మూడు +4 నాలుగు +5 అయిదు +6 ఆరు +7 ఏడు +8 ఎనిమిది +9 తొమ్మిది \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv new file mode 100644 index 000000000..e74f2ac2f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv @@ -0,0 +1,19 @@ +10 పది +11 పదకొండు +12 పన్నెండు +13 పదమూడు +14 పధ్నాలుగు +15 పదునయిదు +16 పదహారు +17 పదిహేడు +18 పధ్ధెనిమిది +19 పందొమ్మిది +20 ఇరవై +30 ముప్పై +40 నలభై +50 యాభై +60 అరవై +70 డెబ్బై +80 ఎనభై +90 తొంభై 
+(The file `/Users/e.saijayasri/Desktop/repo/NeMo-text-processing/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv` exists, but is empty) diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv new file mode 100644 index 000000000..9a7443dfe --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv @@ -0,0 +1 @@ +0 సున్న diff --git a/nemo_text_processing/inverse_text_normalization/te/graph_utils.py b/nemo_text_processing/inverse_text_normalization/te/graph_utils.py new file mode 100644 index 000000000..b002efa52 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/graph_utils.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright 2024 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + +NEMO_HI_DIGIT = pynini.union("०", "१", "२", "३", "४", "५", "६", "७", "८", "९").optimize() +DEVANAGARI_DIGIT = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"] + +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = u"\u00a0" +NEMO_ZWNJ = u"\u200c" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_CHAR, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + + +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("ऋणात्मक", "नकारात्मक").optimize() + + +def integer_to_devanagari(n: int) -> str: + return ''.join(DEVANAGARI_DIGIT[int(d)] for d in str(n)) + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive 
(FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. 
"BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. + + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ 
pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/te/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/te/taggers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py new file mode 100644 index 000000000..f23555d04 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py @@ -0,0 +1,30 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.te.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.te.utils import get_abs_path + + +class CardinalFst(GraphFst): + """ + Classifies spoken Telugu numbers back to digits, + e.g. 
'ఒకటి' -> cardinal { integer: "1" } + """ + + def __init__(self): + super().__init__(name="cardinal", kind="classify") + + # Load TSV files and invert them (word -> digit) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + graph_teens_and_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + + # Combine all graphs + graph = graph_digit | graph_zero | graph_teens_and_ties + graph = graph.optimize() + + # Wrap with token labels + final_graph = pynutil.insert('integer: "') + graph + pynutil.insert('"') + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/te/utils.py b/nemo_text_processing/inverse_text_normalization/te/utils.py new file mode 100644 index 000000000..8e3f62c3c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/utils.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import logging +import os +import pynini + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + abs_path = os.path.dirname(os.path.abspath(__file__)) + os.sep + rel_path + + if not os.path.exists(abs_path): + logging.warning(f'{abs_path} does not exist') + return abs_path + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + label_tsv = open(abs_path, encoding="utf-8") + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels + + +from pynini.lib import pynutil + + +def apply_fst(text, fst): + """Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. + """ + try: + print(pynini.shortestpath(text @ fst).string()) + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") diff --git a/nemo_text_processing/inverse_text_normalization/te/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/te/verbalizers/__init__.py new file mode 100644 index 000000000..d9155f923 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py new file mode 100644 index 000000000..f9d4af0a4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py @@ -0,0 +1,30 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.te.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class CardinalFst(GraphFst): + """ + Verbalizes Telugu digits, + e.g. cardinal { integer: "5" } -> 5 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + # Keep digits between quotes (1+ non-quote chars) + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/run_cardinal_tests.py b/run_cardinal_tests.py new file mode 100644 index 000000000..93cd74107 --- /dev/null +++ b/run_cardinal_tests.py @@ -0,0 +1,94 @@ +import sys +import importlib +import pynini + + +def apply_fst(fst, text): + lattice = text @ fst + if lattice.num_states() == 0: + return None + + out = pynini.shortestpath(lattice) + return out.string() if out.num_states() else None + + +def main(): + if len(sys.argv) < 4: + print('Usage: python run_cardinal_tests.py ') + sys.exit(2) + + lang, direction, path = sys.argv[1], sys.argv[2], sys.argv[3] + base = 'text_normalization' if direction == 'tn' else 'inverse_text_normalization' + + tagger = importlib.import_module(f'nemo_text_processing.{base}.{lang}.taggers.cardinal').CardinalFst().fst + verbalizer = importlib.import_module(f'nemo_text_processing.{base}.{lang}.verbalizers.cardinal').CardinalFst().fst + + passed = failed = 0 + with open(path, encoding='utf-8') as f: + for line in f: + line = line.rstrip('\n') + if not line.strip() or '~' 
not in line: + continue + inp, expected = [s.strip() for s in line.split('~', 1)] + tagged = apply_fst(tagger, inp) + result = apply_fst(verbalizer, tagged) if tagged is not None else None + if result == expected: + passed += 1 + else: + failed += 1 + print(f"FAIL: {inp!r} -> got {result!r}, expected {expected!r}") + + print(f"\n{passed} passed, {failed} failed.") + sys.exit(1 if failed else 0) + + +if __name__ == '__main__': + main() +# run_cardinal_tests.py -- simple checker for the Cardinal exercise. +# Usage (from the repo root, inside your conda env): +# python run_cardinal_tests.py +# Example: +# python run_cardinal_tests.py LANGCODE DIRECTION test_cases_cardinal.txt +import sys +import importlib +import pynini + + +def apply_fst(fst, text): + lattice = text @ fst + if lattice.num_states() == 0: + return None # input was rejected by the grammar + out = pynini.shortestpath(lattice) + return out.string() if out.num_states() else None + + +def main(): + lang, direction, path = sys.argv[1], sys.argv[2], sys.argv[3] + base = "text_normalization" if direction == "tn" else "inverse_text_normalization" + tagger = importlib.import_module( + f"nemo_text_processing.{base}.{lang}.taggers.cardinal").CardinalFst().fst + verbalizer = importlib.import_module( + f"nemo_text_processing.{base}.{lang}.verbalizers.cardinal").CardinalFst().fst + + passed = failed = 0 + with open(path, encoding="utf-8") as f: + for line in f: + line = line.rstrip("\n") + if not line.strip() or "~" not in line: + continue + inp, expected = [s.strip() for s in line.split("~", 1)] + tagged = apply_fst(tagger, inp) + result = apply_fst(verbalizer, tagged) if tagged is not None else None + if result == expected: + passed += 1 + else: + failed += 1 + print(f"FAIL: {inp!r} -> got {result!r}, expected {expected!r}") + + print(f"\n{passed} passed, {failed} failed.") + sys.exit(1 if failed else 0) + + +if __name__ == "__main__": + main() + \ No newline at end of file diff --git a/test_cases_cardinal.txt 
b/test_cases_cardinal.txt new file mode 100644 index 000000000..aa398c460 --- /dev/null +++ b/test_cases_cardinal.txt @@ -0,0 +1,29 @@ +సున్న~0 +ఒకటి~1 +రెండు~2 +మూడు~3 +నాలుగు~4 +అయిదు~5 +ఆరు~6 +ఏడు~7 +ఎనిమిది~8 +తొమ్మిది~9 +పది~10 +పదకొండు~11 +పన్నెండు~12 +పదమూడు~13 +పధ్నాలుగు~14 +పదునయిదు~15 +పదహారు~16 +పదిహేడు~17 +పధ్ధెనిమిది~18 +పందొమ్మిది~19 +ఇరవై~20 +ముప్పై~30 +నలభై~40 +యాభై~50 +అరవై~60 +డెబ్బై~70 +ఎనభై~80 +తొంభై~90 + From cccd9f7cb32440aed6dc5336d103845b97702db6 Mon Sep 17 00:00:00 2001 From: jaya-TN Date: Fri, 5 Jun 2026 15:31:36 +0530 Subject: [PATCH 2/6] feat(te): add stretch goals - compound numbers (21-99), hundreds (100-999), and native Telugu numerals Signed-off-by: jaya-TN --- .../te/data/numbers/compound.tsv | 72 +++++++++++ .../te/data/numbers/digit.tsv | 11 +- .../te/data/numbers/hundreds.tsv | 121 ++++++++++++++++++ .../te/data/numbers/zero.tsv | 1 + .../te/taggers/cardinal.py | 4 +- .../te/verbalizers/cardinal.py | 6 +- test_cases_cardinal.txt | 17 ++- 7 files changed, 224 insertions(+), 8 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/te/data/numbers/compound.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/te/data/numbers/hundreds.tsv diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/compound.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/compound.tsv new file mode 100644 index 000000000..0ff2d5be7 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/compound.tsv @@ -0,0 +1,72 @@ +21 ఇరవై ఒకటి +22 ఇరవై రెండు +23 ఇరవై మూడు +24 ఇరవై నాలుగు +25 ఇరవై అయిదు +26 ఇరవై ఆరు +27 ఇరవై ఏడు +28 ఇరవై ఎనిమిది +29 ఇరవై తొమ్మిది +31 ముప్పై ఒకటి +32 ముప్పై రెండు +33 ముప్పై మూడు +34 ముప్పై నాలుగు +35 ముప్పై అయిదు +36 ముప్పై ఆరు +37 ముప్పై ఏడు +38 ముప్పై ఎనిమిది +39 ముప్పై తొమ్మిది +41 నలభై ఒకటి +42 నలభై రెండు +43 నలభై మూడు +44 నలభై నాలుగు +45 నలభై అయిదు +46 నలభై ఆరు +47 నలభై ఏడు +48 నలభై ఎనిమిది +49 నలభై తొమ్మిది +51 యాభై ఒకటి 
+52 యాభై రెండు +53 యాభై మూడు +54 యాభై నాలుగు +55 యాభై అయిదు +56 యాభై ఆరు +57 యాభై ఏడు +58 యాభై ఎనిమిది +59 యాభై తొమ్మిది +61 అరవై ఒకటి +62 అరవై రెండు +63 అరవై మూడు +64 అరవై నాలుగు +65 అరవై అయిదు +66 అరవై ఆరు +67 అరవై ఏడు +68 అరవై ఎనిమిది +69 అరవై తొమ్మిది +71 డెబ్బై ఒకటి +72 డెబ్బై రెండు +73 డెబ్బై మూడు +74 డెబ్బై నాలుగు +75 డెబ్బై అయిదు +76 డెబ్బై ఆరు +77 డెబ్బై ఏడు +78 డెబ్బై ఎనిమిది +79 డెబ్బై తొమ్మిది +81 ఎనభై ఒకటి +82 ఎనభై రెండు +83 ఎనభై మూడు +84 ఎనభై నాలుగు +85 ఎనభై అయిదు +86 ఎనభై ఆరు +87 ఎనభై ఏడు +88 ఎనభై ఎనిమిది +89 ఎనభై తొమ్మిది +91 తొంభై ఒకటి +92 తొంభై రెండు +93 తొంభై మూడు +94 తొంభై నాలుగు +95 తొంభై అయిదు +96 తొంభై ఆరు +97 తొంభై ఏడు +98 తొంభై ఎనిమిది +99 తొంభై తొమ్మిది diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv index 6645fcf9e..cd00fa558 100644 --- a/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/digit.tsv @@ -6,4 +6,13 @@ 6 ఆరు 7 ఏడు 8 ఎనిమిది -9 తొమ్మిది \ No newline at end of file +9 తొమ్మిది +1 ౧ +2 ౨ +3 ౩ +4 ౪ +5 ౫ +6 ౬ +7 ౭ +8 ౮ +9 ౯ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/hundreds.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/hundreds.tsv new file mode 100644 index 000000000..ce6c8d022 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/hundreds.tsv @@ -0,0 +1,121 @@ +100 వంద +101 వంద ఒకటి +102 వంద రెండు +103 వంద మూడు +104 వంద నాలుగు +105 వంద అయిదు +106 వంద ఆరు +107 వంద ఏడు +108 వంద ఎనిమిది +109 వంద తొమ్మిది +110 వంద పది +111 వంద పదకొండు +112 వంద పన్నెండు +113 వంద పదమూడు +114 వంద పధ్నాలుగు +115 వంద పదునయిదు +116 వంద పదహారు +117 వంద పదిహేడు +118 వంద పధ్ధెనిమిది +119 వంద పందొమ్మిది +120 వంద ఇరవై +200 రెండు వంద +201 రెండు వంద ఒకటి +202 రెండు వంద రెండు +203 రెండు వంద మూడు +204 రెండు వంద నాలుగు +205 రెండు వంద అయిదు +206 
రెండు వంద ఆరు +207 రెండు వంద ఏడు +208 రెండు వంద ఎనిమిది +209 రెండు వంద తొమ్మిది +210 రెండు వంద పది +300 మూడు వంద +301 మూడు వంద ఒకటి +302 మూడు వంద రెండు +303 మూడు వంద మూడు +304 మూడు వంద నాలుగు +305 మూడు వంద అయిదు +306 మూడు వంద ఆరు +307 మూడు వంద ఏడు +308 మూడు వంద ఎనిమిది +309 మూడు వంద తొమ్మిది +310 మూడు వంద పది +400 నాలుగు వంద +401 నాలుగు వంద ఒకటి +402 నాలుగు వంద రెండు +403 నాలుగు వంద మూడు +404 నాలుగు వంద నాలుగు +405 నాలుగు వంద అయిదు +406 నాలుగు వంద ఆరు +407 నాలుగు వంద ఏడు +408 నాలుగు వంద ఎనిమిది +409 నాలుగు వంద తొమ్మిది +410 నాలుగు వంద పది +500 అయిదు వంద +501 అయిదు వంద ఒకటి +502 అయిదు వంద రెండు +503 అయిదు వంద మూడు +504 అయిదు వంద నాలుగు +505 అయిదు వంద అయిదు +506 అయిదు వంద ఆరు +507 అయిదు వంద ఏడు +508 అయిదు వంద ఎనిమిది +509 అయిదు వంద తొమ్మిది +510 అయిదు వంద పది +600 ఆరు వంద +601 ఆరు వంద ఒకటి +602 ఆరు వంద రెండు +603 ఆరు వంద మూడు +604 ఆరు వంద నాలుగు +605 ఆరు వంద అయిదు +606 ఆరు వంద ఆరు +607 ఆరు వంద ఏడు +608 ఆరు వంద ఎనిమిది +609 ఆరు వంద తొమ్మిది +610 ఆరు వంద పది +700 ఏడు వంద +701 ఏడు వంద ఒకటి +702 ఏడు వంద రెండు +703 ఏడు వంద మూడు +704 ఏడు వంద నాలుగు +705 ఏడు వంద అయిదు +706 ఏడు వంద ఆరు +707 ఏడు వంద ఏడు +708 ఏడు వంద ఎనిమిది +709 ఏడు వంద తొమ్మిది +710 ఏడు వంద పది +800 ఎనిమిది వంద +801 ఎనిమిది వంద ఒకటి +802 ఎనిమిది వంద రెండు +803 ఎనిమిది వంద మూడు +804 ఎనిమిది వంద నాలుగు +805 ఎనిమిది వంద అయిదు +806 ఎనిమిది వంద ఆరు +807 ఎనిమిది వంద ఏడు +808 ఎనిమిది వంద ఎనిమిది +809 ఎనిమిది వంద తొమ్మిది +810 ఎనిమిది వంద పది +900 తొమ్మిది వంద +901 తొమ్మిది వంద ఒకటి +902 తొమ్మిది వంద రెండు +903 తొమ్మిది వంద మూడు +904 తొమ్మిది వంద నాలుగు +905 తొమ్మిది వంద అయిదు +906 తొమ్మిది వంద ఆరు +907 తొమ్మిది వంద ఏడు +908 తొమ్మిది వంద ఎనిమిది +909 తొమ్మిది వంద తొమ్మిది +910 తొమ్మిది వంద పది +911 తొమ్మిది వంద పదకొండు +920 తొమ్మిది వంద ఇరవై +921 తొమ్మిది వంద ఇరవై ఒకటి +930 తొమ్మిది వంద ముప్పై +931 తొమ్మిది వంద ముప్పై ఒకటి +940 తొమ్మిది వంద నలభై +950 తొమ్మిది వంద యాభై +960 తొమ్మిది వంద అరవై +970 తొమ్మిది వంద డెబ్బై +980 తొమ్మిది వంద ఎనభై +990 తొమ్మిది వంద తొంభై +999 తొమ్మిది వంద తొంభై తొమ్మిది diff --git 
a/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv index 9a7443dfe..fdb4bf4c6 100644 --- a/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/zero.tsv @@ -1 +1,2 @@ 0 సున్న +0 ౦ diff --git a/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py index f23555d04..07ea03452 100644 --- a/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/te/taggers/cardinal.py @@ -18,9 +18,11 @@ def __init__(self): graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() graph_teens_and_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + graph_compound = pynini.string_file(get_abs_path("data/numbers/compound.tsv")).invert() + graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")).invert() # Combine all graphs - graph = graph_digit | graph_zero | graph_teens_and_ties + graph = graph_digit | graph_zero | graph_teens_and_ties | graph_compound | graph_hundreds graph = graph.optimize() # Wrap with token labels diff --git a/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py index f9d4af0a4..55744cfc0 100644 --- a/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/te/verbalizers/cardinal.py @@ -1,11 +1,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.te.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from 
nemo_text_processing.inverse_text_normalization.te.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): diff --git a/test_cases_cardinal.txt b/test_cases_cardinal.txt index aa398c460..a6c31918d 100644 --- a/test_cases_cardinal.txt +++ b/test_cases_cardinal.txt @@ -26,4 +26,19 @@ డెబ్బై~70 ఎనభై~80 తొంభై~90 - +ఇరవై ఒకటి~21 +ముప్పై అయిదు~35 +తొంభై తొమ్మిది~99 +వంద~100 +రెండు వంద ఒకటి~201 +తొమ్మిది వంద తొంభై తొమ్మిది~999 +౦~0 +౧~1 +౨~2 +౩~3 +౪~4 +౫~5 +౬~6 +౭~7 +౮~8 +౯~9 From a0b6394d4785981584119e87e87b21ad9caf664b Mon Sep 17 00:00:00 2001 From: jaya-TN Date: Fri, 5 Jun 2026 16:17:02 +0530 Subject: [PATCH 3/6] deleted one line Signed-off-by: jaya-TN --- .../te/data/numbers/teens_and_ties.tsv | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv b/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv index e74f2ac2f..b1c558e12 100644 --- a/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv @@ -16,4 +16,3 @@ 70 డెబ్బై 80 ఎనభై 90 తొంభై -(The file `/Users/e.saijayasri/Desktop/repo/NeMo-text-processing/nemo_text_processing/inverse_text_normalization/te/data/numbers/teens_and_ties.tsv` exists, but is empty) From b5d525d7d875276dab1b112e525946929c2f5f6e Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 5 Jun 2026 11:54:28 -0400 Subject: [PATCH 4/6] Add hi_en Code Switched (#415) (#426) * Add hi_en Code Switched (#415) * Add hi_en Code Switched Signed-off-by: RajanPutty * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address PR #415 review: restore ko, dedupe whitelists, expand hi_en tests, add hi_en CI Signed-off-by: Rajan Putty * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --------- Signed-off-by: RajanPutty Signed-off-by: Rajan Putty Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * Add hi_en Code Switched (#415) * Add hi_en Code Switched Signed-off-by: RajanPutty * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address PR #415 review: restore ko, dedupe whitelists, expand hi_en tests, add hi_en CI Signed-off-by: Rajan Putty * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: RajanPutty Signed-off-by: Rajan Putty Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update jenkins Signed-off-by: Mariana Graterol Fuenmayor * fix jenkins bug Signed-off-by: Mariana Graterol Fuenmayor * separate cache dirs Signed-off-by: Mariana Graterol Fuenmayor * refresh cache dirs Signed-off-by: Mariana Graterol Fuenmayor * refresh cache dir ko Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: RajanPutty Signed-off-by: Rajan Putty Signed-off-by: Mariana <47233618+mgrafu@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: RajanPutty Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 19 +- .../hi_en/__init__.py | 17 ++ .../hi_en/graph_utils.py | 13 ++ .../hi_en/taggers/__init__.py | 13 ++ .../hi_en/taggers/tokenize_and_classify.py | 173 ++++++++++++++++++ .../inverse_text_normalization/hi_en/utils.py | 27 +++ .../hi_en/verbalizers/__init__.py | 13 ++ .../hi_en/verbalizers/verbalize.py | 102 +++++++++++ .../hi_en/verbalizers/verbalize_final.py | 44 +++++ .../inverse_normalize.py | 8 +- .../run_evaluate.py | 20 +- tests/nemo_text_processing/hi_en/__init__.py | 13 ++ .../test_cases_address.txt | 41 +++++ .../test_cases_cardinal.txt | 61 ++++++ 
.../test_cases_date.txt | 37 ++++ .../test_cases_date_cased.txt | 70 +++++++ .../test_cases_decimal.txt | 25 +++ .../test_cases_electronic.txt | 17 ++ .../test_cases_fraction.txt | 39 ++++ .../test_cases_measure.txt | 25 +++ .../test_cases_money.txt | 41 +++++ .../test_cases_ordinal.txt | 26 +++ .../test_cases_telephone.txt | 39 ++++ .../test_cases_time.txt | 29 +++ .../test_cases_whitelist.txt | 17 ++ .../test_cases_word.txt | 19 ++ .../hi_en/test_address.py | 38 ++++ .../hi_en/test_cardinal.py | 48 +++++ tests/nemo_text_processing/hi_en/test_date.py | 48 +++++ .../hi_en/test_decimal.py | 48 +++++ .../hi_en/test_electronic.py | 41 +++++ .../hi_en/test_fraction.py | 38 ++++ .../hi_en/test_measure.py | 48 +++++ .../nemo_text_processing/hi_en/test_money.py | 48 +++++ .../hi_en/test_ordinal.py | 48 +++++ ..._sparrowhawk_inverse_text_normalization.sh | 102 +++++++++++ .../hi_en/test_telephone.py | 41 +++++ tests/nemo_text_processing/hi_en/test_time.py | 48 +++++ .../hi_en/test_whitelist.py | 48 +++++ tests/nemo_text_processing/hi_en/test_word.py | 48 +++++ .../pynini_export.py | 10 +- 41 files changed, 1642 insertions(+), 8 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py create mode 100644 tests/nemo_text_processing/hi_en/__init__.py create mode 100644 
tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/hi_en/test_address.py create mode 100644 tests/nemo_text_processing/hi_en/test_cardinal.py create mode 100644 tests/nemo_text_processing/hi_en/test_date.py create mode 100644 tests/nemo_text_processing/hi_en/test_decimal.py create mode 100644 tests/nemo_text_processing/hi_en/test_electronic.py create mode 100644 tests/nemo_text_processing/hi_en/test_fraction.py create mode 100644 tests/nemo_text_processing/hi_en/test_measure.py create mode 100644 
tests/nemo_text_processing/hi_en/test_money.py create mode 100644 tests/nemo_text_processing/hi_en/test_ordinal.py create mode 100644 tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/hi_en/test_telephone.py create mode 100644 tests/nemo_text_processing/hi_en/test_time.py create mode 100644 tests/nemo_text_processing/hi_en/test_whitelist.py create mode 100644 tests/nemo_text_processing/hi_en/test_word.py diff --git a/Jenkinsfile b/Jenkinsfile index 23c99d3ad..4ac66d02f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -12,9 +12,10 @@ pipeline { environment { AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0' DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0' - EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-25-0' + EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3' ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0' ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0' + HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-4' FR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-07-25-0' HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0' PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-01-26-1' @@ -27,8 +28,8 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-5' + KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-25-6' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -104,7 +105,11 @@ pipeline { sh 
'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}' } } - + stage('L0: Codeswitched HI/EN ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi_en --text="एक" --cache_dir ${HI_EN_TN_CACHE}' + } + } } } @@ -168,7 +173,6 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}' } } - } } @@ -409,6 +413,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/es_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${ES_EN_TN_CACHE}' } } + stage('L1: Run all Codeswitched HI/EN TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hi_en/ -m "not pleasefixme" --cpu --tn_cache_dir ${HI_EN_TN_CACHE}' + } + } stage('L1: Run all AR TN/ITN tests (restore grammars from cache)') { steps { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ar/ -m "not pleasefixme" --cpu --tn_cache_dir ${AR_TN_CACHE}' diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py new file mode 100644 index 000000000..cfe932251 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..09d7f2b15 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py @@ -0,0 +1,173 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst +from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst +from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst +from nemo_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst as EnElectronicFst +from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst as EnMeasureFst +from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst as EnMoneyFst +from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst as EnOrdinalFst +from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst as EnPunctuationFst +from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst as EnTelephoneFst +from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst as EnTimeFst +from nemo_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst as EnWhiteListFst +from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst as EnWordFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst +from 
nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with Hindi whitelist replacements. If None, defaults to the Hindi whitelist at + nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv + en_whitelist: path to a file with English whitelist replacements. If None, defaults to the English whitelist at + nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv + input_case: accepting either "lower_cased" or "cased" input. 
+ """ + + def __init__( + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + en_whitelist: str = None, + input_case: str = INPUT_LOWER_CASED, + ): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"hi_en_itn_{input_case}.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logger.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + ordinal_graph = ordinal.fst + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + fraction = FractionFst(cardinal) + fraction_graph = fraction.fst + + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst + date_graph = DateFst(cardinal, ordinal).fst + word_graph = WordFst().fst + time_graph = TimeFst(cardinal).fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst + whitelist_graph = WhiteListFst(input_file=whitelist).fst + punct_graph = PunctuationFst().fst + telephone_graph = TelephoneFst(cardinal).fst + + en_cardinal = EnCardinalFst(input_case=input_case) + en_cardinal_graph = en_cardinal.fst + + en_ordinal = EnOrdinalFst(cardinal=en_cardinal, input_case=input_case) + en_ordinal_graph = en_ordinal.fst + + en_decimal = EnDecimalFst(cardinal=en_cardinal, input_case=input_case) + en_decimal_graph = en_decimal.fst + + en_measure_graph = EnMeasureFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst + en_date_graph = EnDateFst(ordinal=en_ordinal, input_case=input_case).fst + en_word_graph = EnWordFst().fst + en_time_graph = EnTimeFst(input_case=input_case).fst + en_money_graph = EnMoneyFst(cardinal=en_cardinal, decimal=en_decimal, 
input_case=input_case).fst + en_whitelist_graph = EnWhiteListFst(input_file=en_whitelist, input_case=input_case).fst + en_punct_graph = EnPunctuationFst().fst + en_electronic_graph = EnElectronicFst(input_case=input_case).fst + en_telephone_graph = EnTelephoneFst(cardinal=en_cardinal, input_case=input_case).fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(en_whitelist_graph, 1.01) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(en_time_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(en_date_graph, 1.09) + | pynutil.add_weight(decimal_graph, 1.09) + | pynutil.add_weight(en_decimal_graph, 1.09) + | pynutil.add_weight(fraction_graph, 1.09) + | pynutil.add_weight(measure_graph, 1.6) + | pynutil.add_weight(en_measure_graph, 1.1) + | pynutil.add_weight(cardinal_graph, 1.6) + | pynutil.add_weight(en_cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.6) + | pynutil.add_weight(en_ordinal_graph, 1.09) + | pynutil.add_weight(money_graph, 1.6) + | pynutil.add_weight(en_money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.6) + | pynutil.add_weight(en_telephone_graph, 1.1) + | pynutil.add_weight(en_electronic_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + | pynutil.add_weight(en_word_graph, 120) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + en_punct = ( + pynutil.insert("tokens { ") + pynutil.add_weight(en_punct_graph, weight=1.3) + pynutil.insert(" }") + ) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + + token + + pynini.closure(pynutil.insert(" ") + punct | en_punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) 
+ logger.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/utils.py new file mode 100644 index 000000000..2bcba780d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py new file mode 100644 index 000000000..81cc937a1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py @@ -0,0 +1,102 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as EnCardinalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.date import DateFst as EnDateFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.decimal import DecimalFst as EnDecimalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.electronic import ElectronicFst as EnElectronicFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.measure import MeasureFst as EnMeasureFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.money import MoneyFst as EnMoneyFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.ordinal import OrdinalFst as EnOrdinalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.telephone import TelephoneFst as EnTelephoneFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.time import TimeFst as EnTimeFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst as EnWhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst +from 
nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst() + ordinal_graph = ordinal.fst + + decimal = DecimalFst() + decimal_graph = decimal.fst + + fraction_graph = FractionFst().fst + + date_graph = DateFst(cardinal, ordinal).fst + time_graph = TimeFst().fst + measure_graph = MeasureFst(cardinal, decimal).fst + money_graph = MoneyFst(cardinal, decimal).fst + telephone_graph = TelephoneFst(cardinal).fst + whitelist_graph = WhiteListFst().fst + + en_cardinal = EnCardinalFst() + en_cardinal_graph = en_cardinal.fst + en_ordinal_graph = EnOrdinalFst().fst + en_decimal = EnDecimalFst() + en_decimal_graph = en_decimal.fst + en_measure_graph = EnMeasureFst(decimal=en_decimal, cardinal=en_cardinal).fst + en_money_graph = EnMoneyFst(decimal=en_decimal).fst + en_date_graph = EnDateFst().fst + en_whitelist_graph = EnWhiteListFst().fst + en_telephone_graph = EnTelephoneFst().fst + en_time_graph = EnTimeFst().fst + en_electronic_graph = EnElectronicFst().fst + + graph = ( + en_time_graph + | pynutil.add_weight(time_graph, 1.1) + | date_graph + | pynutil.add_weight(en_date_graph, 1.1) + | money_graph + | pynutil.add_weight(en_money_graph, 1.1) + | fraction_graph + | measure_graph + | pynutil.add_weight(en_measure_graph, 1.1) + | ordinal_graph + | pynutil.add_weight(en_ordinal_graph, 1.1) + | decimal_graph + | pynutil.add_weight(en_decimal_graph, 1.1) + | cardinal_graph + | pynutil.add_weight(en_cardinal_graph, 1.1) + | whitelist_graph + | 
pynutil.add_weight(en_whitelist_graph, 1.1) + | telephone_graph + | pynutil.add_weight(en_telephone_graph, 1.1) + | en_electronic_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py new file mode 100644 index 000000000..05386f09d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py @@ -0,0 +1,44 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. 
+ tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index 1ab727660..9a6fcc64c 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -101,7 +101,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) - elif lang == 'es_en': # Arabic + elif lang == 'es_en': # Spanish-English code-switch from nemo_text_processing.inverse_text_normalization.es_en.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import ( VerbalizeFinalFst, @@ -121,6 +121,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'hi_en': # Hindi-English code-switch + from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) elif lang == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize_final import ( @@ 
-198,6 +203,7 @@ def parse_args(): 'vi', 'ar', 'es_en', + 'hi_en', 'zh', 'he', 'hi', diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index c93d8df64..cf9b29fce 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,25 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "ko", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'], + choices=[ + "ar", + "de", + "en", + "es", + "es_en", + "fr", + "hi", + "hi_en", + "hy", + "ko", + "mr", + "pt", + "ru", + "sv", + "vi", + "zh", + 'ja', + ], default="en", type=str, ) diff --git a/tests/nemo_text_processing/hi_en/__init__.py b/tests/nemo_text_processing/hi_en/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt new file mode 100644 index 000000000..b4461477d --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt @@ -0,0 +1,41 @@ +दिल्ली एक एक शून्य शून्य शून्य एक~दिल्ली ११०००१ +मुंबई चार शून्य शून्य शून्य शून्य एक~मुंबई ४००००१ +चेन्नई छह शून्य शून्य शून्य शून्य एक~चेन्नई ६००००१ +कोलकाता सात शून्य शून्य शून्य शून्य एक~कोलकाता ७००००१ +बेंगलुरु पाँच छह शून्य शून्य शून्य एक~बेंगलुरु ५६०००१ +सात शून्य शून्य ओक स्ट्रीट~७०० ओक स्ट्रीट +एक एक जंगल रोड~११ जंगल रोड +तीन शून्य एक पार्क एवेन्यू~३०१ पार्क एवेन्यू +गली नंबर एक सात जीएकगढ़~गली नंबर १७ जीएकगढ़ +अदनान अपार्टमेंट फ्लैट नंबर पाँच पाँच~अदनान अपार्टमेंट फ्लैट नंबर ५५ +प्लॉट नंबर आठ बालाजी मार्केट~प्लॉट नंबर ८ बालाजी मार्केट +बूथ सात शून्य, सेक्टर आठ, चंडीगढ़~बूथ ७०, सेक्टर ८, चंडीगढ़ +दो दो दो एक सदर्न स्ट्रीट~२२२१ सदर्न स्ट्रीट +छह दो पाँच स्कूल स्ट्रीट~६२५ स्कूल स्ट्रीट +पाँच शून्य छह स्टेट रोड~५०६ स्टेट रोड +छह छह हाइफ़न चार, पार्कहर्स्ट रोड~६६-४, पार्कहर्स्ट रोड +एक चार बटा तीन, मथुरा रोड~१४/३, मथुरा रोड +अमरावती छह पाँच पाँच नौ तीन शून्य~अमरावती ६५५९३० +अमरावती चार छह आठ दो पाँच दो~अमरावती ४६८२५२ +शिमला, हिमाचल प्रदेश पाँच नौ तीन नौ आठ आठ~शिमला, हिमाचल प्रदेश ५९३९८८ +रांची, झारखंड सात तीन छह पाँच पाँच सात~रांची, झारखंड ७३६५५७ +कोहिमा, नागालैंड चार चार आठ तीन सात सात~कोहिमा, नागालैंड ४४८३७७ +मुंबई, महाराष्ट्र आठ तीन नौ चार आठ आठ~मुंबई, महाराष्ट्र ८३९४८८ +मुंबई, महाराष्ट्र दो नौ शून्य नौ तीन सात~मुंबई, महाराष्ट्र २९०९३७ +गांधीनगर, गुजरात आठ शून्य आठ तीन सात चार~गांधीनगर, गुजरात ८०८३७४ +रायपुर, छत्तीसगढ़ एक एक शून्य छह तीन पाँच~रायपुर, छत्तीसगढ़ ११०६३५ +भोपाल, मध्य प्रदेश सात पाँच एक दो दो पाँच~भोपाल, मध्य प्रदेश ७५१२२५ +अगरतला, त्रिपुरा नौ एक पाँच तीन शून्य पाँच~अगरतला, त्रिपुरा ९१५३०५ +लखनऊ, उत्तर प्रदेश आठ शून्य दो चार आठ एक~लखनऊ, उत्तर प्रदेश ८०२४८१ +श्रीनगर, जम्मू और कश्मीर नौ छह 
चार पाँच दो तीन~श्रीनगर, जम्मू और कश्मीर ९६४५२३ +seven hundred oak street~700 oak street +eleven hundred park avenue~1100 park avenue +three hundred one main street~301 main street +two hundred and fifty four san tomas avenue~254 san tomas avenue +fourteen hundred twenty eight elm street~1428 elm street +twelve hundred and eleven east avenue~1211 east avenue +one thousand one hundred laguna court~1100 laguna court +two hundred and fifty four arques avenue~254 arques avenue +मुंबई one one zero zero zero one~मुंबई ११०००१ +चेन्नई six zero zero zero zero one~चेन्नई ६००००१ +दिल्ली one one zero zero zero one~दिल्ली ११०००१ diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..4b6011eb5 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,61 @@ +शून्य~० +एक~१ +दो~२ +तीन~३ +दस~१० +ग्यारह~११ +बारह~१२ +तेरह~१३ +चौदह~१४ +पन्द्रह~१५ +सोलह~१६ +बीस~२० +तेईस~२३ +पचास~५० +सत्तर~७० +नब्बे~९० +सौ~१०० +दो सौ~२०० +एक सौ दस~११० +तीन सौ पचास~३५० +हज़ार~१००० +एक हज़ार~१००० +दो हज़ार~२००० +दस हज़ार~१०००० +एक लाख~१००००० +दो लाख~२००००० +एक करोड़~१००००००० +पाँच करोड़~५००००००० +साढ़े तीन सौ~३५० +सवा दो सौ~२२५ +डेढ़ सौ~१५० +ढाई सौ~२५० +साढ़े तीन हज़ार~३५०० +सवा दो हज़ार~२२५० +ढाई हज़ार~२५०० +पौने चार सौ~३७५ +पौने दो सौ~१७५ +zero~zero +sixty~60 +nineteen~19 +two hundred and fifty four~254 +one hundred forty seven thousand four hundred fifty one~147451 +one million one hundred fifty six thousand one hundred seventy three~1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +minus twenty five thousand thirty seven~-25037 +minus sixty~-60 +forty six thousand six hundred sixty four~46664 +two million three~2000003 +one thousand thirteen~1013 +one thousand one~1001 +one thousand one hundred~1100 +one 
thousand twenty six~1026 +one thousand one hundred twenty six~1126 +eleven hundred~1100 +twenty one hundred~2100 +twenty one hundred and eleven~2111 +eleven hundred twenty one~1121 +twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three~219836593 +दस and twenty~१० and 20 +सौ and one hundred~१०० and 100 +two hundred और तीन सौ~200 और ३०० diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..6f5db9407 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,37 @@ +पाँच जनवरी~५ जनवरी +दस फ़रवरी~१० फ़रवरी +पन्द्रह मार्च~१५ मार्च +बीस अप्रैल~२० अप्रैल +तीस जून~३० जून +पाँच जनवरी दो हज़ार बारह~५ जनवरी, २०१२ +दस फ़रवरी उन्नीस सौ नब्बे~१० फ़रवरी, १९९० +दो हज़ार बारह~२०१२ +उन्नीस सौ सत्तर~१९७० +दो हज़ार~२००० +उन्नीस सौ~१९०० +दो हज़ार बारह से दो हज़ार पन्द्रह~२०१२-२०१५ +पहली सदी~पहली सदी +बीसवीं सदी~२०वीं सदी +दो सौ तीन ईसा पूर्व~२०३ ई.पू. +चार सौ बीस ईसवी~४२० ई. +पन्द्रह सौ ईसवी~१५०० ई. +दो हज़ार बीस ईसवी~२०२० ई. 
+पन्द्रह अगस्त उन्नीस सौ सैंतालीस~१५ अगस्त, १९४७ +छब्बीस जनवरी उन्नीस सौ पचास~२६ जनवरी, १९५० +january first~january 1 +july twenty second two thousand eight~july 22, 2008 +june thirty~june 30 +july twenty fifth twenty twelve~july 25, 2012 +nineteen seventeen~1917 +twenty twelve~2012 +nineteen ninety four~1994 +two thousand three~2003 +the twenty fifth of july twenty twelve~25 july, 2012 +the fifteenth of january~15 january +february twenty fifth twenty sixteen~february 25, 2016 +november twenty fourth twenty fourteen~november 24, 2014 +january पच्चीस~january २५ +पाँच january~५ january +दस jan~१० jan +march बीस~march २० +दस march twenty twenty~१० march 2020 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..96bbc7d32 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,70 @@ +july twenty fifth two thousand twelve~july 25, 2012 +nineteen eighties~1980s +two thousand and twenty~2020 +two thousand and nine~2009 +the twenty fifth of july twenty twelve~25 july, 2012 +the twenty fifth of july two thousand twelve~25 july, 2012 +the twenty second of july twenty twelve~22 july, 2012 +the fifteenth of january~15 january +the seventeenth of may twenty ten~17 may, 2010 +january first~january 1 +july twenty second two thousand eight~july 22, 2008 +june thirty~june 30 +july twenty fifth twenty twelve~july 25, 2012 +nineteen seventeen~1917 +twenty twelve~2012 +march sixteen sixty five~march 1665 +sixteen sixty five~1665 +july two thousand twelve~july 2012 +october nineteen oh five~october 1905 +july fifteen o six~july 1506 +the twenty fifth of july twenty twelve~25 july, 2012 +july twenty fifth twenty twelve~july 25, 2012 +july twenty fifth two thousand twelve~july 25, 2012 +july one thousand eight hundred seventy six~july 1876 
+february twenty fifth twenty sixteen~february 25, 2016 +november twenty fourth twenty fourteen~november 24, 2014 +nineteen ninety four~1994 +two thousand three~2003 +one thousand eight~1008 +nineteen seventy six~1976 +june twentieth twenty fourteen~june 20, 2014 +nineteen seventy three~1973 +nineteen seventy five~1975 +eleven fifty five~1155 +July twenty fifth two thousand twelve~July 25, 2012 +Nineteen eighties~1980s +Two thousand and twenty~2020 +Two thousand and nine~2009 +The twenty fifth of july twenty twelve~25 july, 2012 +The twenty fifth of july two thousand twelve~25 july, 2012 +The twenty second of july twenty twelve~22 july, 2012 +The fifteenth of january~15 january +The fifteenth of January~15 January +The seventeenth of may twenty ten~17 may, 2010 +January first~January 1 +July twenty second two thousand eight~July 22, 2008 +June thirty~June 30 +July twenty fifth twenty twelve~July 25, 2012 +Nineteen seventeen~1917 +Twenty twelve~2012 +March sixteen sixty five~March 1665 +Sixteen sixty five~1665 +July two thousand twelve~July 2012 +October nineteen oh five~October 1905 +July fifteen o six~July 1506 +The twenty fifth of july twenty twelve~25 july, 2012 +The twenty fifth of July twenty twelve~25 July, 2012 +July twenty fifth twenty twelve~July 25, 2012 +July twenty fifth two thousand twelve~July 25, 2012 +July one thousand eight hundred seventy six~July 1876 +February twenty fifth twenty sixteen~February 25, 2016 +November twenty fourth twenty fourteen~November 24, 2014 +Nineteen ninety four~1994 +Two thousand three~2003 +One thousand eight~1008 +Nineteen seventy six~1976 +June twentieth twenty fourteen~June 20, 2014 +Nineteen seventy three~1973 +Nineteen seventy five~1975 +Eleven fifty five~1155 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..075aebab7 --- /dev/null +++ 
b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,25 @@ +एक दशमलव दो छह~१.२६ +शून्य दशमलव पाँच~०.५ +ऋण एक दशमलव दो छह~-१.२६ +दो दशमलव तीन चार~२.३४ +पाँच दशमलव शून्य एक~५.०१ +five point two million~5.2 million +one hundred sixty four point five eight thousand~164.58 thousand +four hundred million~400 million +fifty billion~50 billion +one point two five billion~1.25 billion +thirteen billion~13 billion +thirty billion~30 billion +two thousand eight hundred five point eight seven three billion~2805.873 billion +eighteen~18 +eighteen point eight five~18.85 +eighteen point five o~18.50 +eighteen point five six~18.56 +eighteen point nine~18.9 +eighteen point o five~18.05 +eighteen point one two~18.12 +eighteen point o one~18.01 +zero point two six~0.26 +point zero two~.02 +sixty point two~60.2 +minus sixty point two four zero zero~-60.2400 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..de609c263 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt @@ -0,0 +1,17 @@ +a dot b c at g mail dot com~a.bc@gmail.com +a at gmail dot com~a@gmail.com +c d f at a b c dot e d u~cdf@abc.edu +a b c at g mail dot a b c~abc@gmail.abc +a b c at a b c dot com~abc@abc.com +a s d f one two three at a b c dot com~asdf123@abc.com +a one b two at a b c dot com~a1b2@abc.com +a b three dot s d d dot three at g mail dot com~ab3.sdd.3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm +w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm +n vidia dot com~nvidia.com +abc at gmail dot com~abc@gmail.com +athreed at gmail dot com~athreed@gmail.com +kore dot ai~kore.ai +a at nvidia dot com~a@nvidia.com +a dot b c at nvidia 
dot com~a.bc@nvidia.com +a b three hyphen s d d dash three at g mail dot com~ab3-sdd-3@gmail.com diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..ce534bc21 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,39 @@ +आठ बटा तीन~८/३ +दो बटा पाँच~२/५ +एक बटा चार~१/४ +दो सही दो बटा तीन~२ २/३ +तीन सही एक बटा चार~३ १/४ +पाँच सही तीन बटा सात~५ ३/७ +सात बटे ग्यारह~७/११ +छह बटा तेरह~६/१३ +डेढ़~१ १/२ +ढाई~२ १/२ +पाव~१/४ +एक सौ नौ बटा एक सौ चौबीस~१०९/१२४ +एक सौ एक बटा दो~१०१/२ +दो सौ एक बटा दो~२०१/२ +एक सौ एक बटा चार~१०१/४ +दो सौ बटा पाँच सौ~२००/५०० +दो सौ बटा बारह~२००/१२ +एक सौ तेईस बटा एक सौ पच्चीस~१२३/१२५ +छह सौ बासठ बटा एक~६६२/१ +एक सौ पाँच बटा सात~१०५/७ +छह सौ चौवन बटा तीन~६५४/३ +एक सौ तैंतीस सही एक बटा दो~१३३ १/२ +एक सौ तैंतीस सही दो बटा तीन~१३३ २/३ +एक सही छह बटा छह~१ ६/६ +दो सही एक बटा छह~२ १/६ +तीन सही तीन बटा चार~३ ३/४ +एक सौ बीस सही तीन बटा चार~१२० ३/४ +एक सौ बीस सही पिछत्तर बटा नब्बे~१२० ७५/९० +सवा चौरासी~८४ १/४ +आधा~१/२ +साढ़े~१/२ +सवा~१/४ +पौन~३/४ +पौना~३/४ +सवा पैंतीस~३५ १/४ +साढ़े चार सौ बटा दस~४५०/१० +तीन चौथाई~३/४ +दो तिहाई~२/३ +एक चौथाई~१/४ diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..d232013ae --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,25 @@ +दो सौ मीटर~२०० m +तीन किलोग्राम~३ kg +साढ़े तीन किलोग्राम~३.५ kg +ऋण बारह किलोग्राम~-१२ kg +पचास किलोमीटर~५० km +तीन सौ ग्राम~३०० g +सवा दो किलोग्राम~२.२५ kg +दो सौ किलोमीटर प्रति घंटा~२०० km/h +बीस डिग्री सेल्सियस~२० °C +एक सौ मीटर~१०० m +two hundred meters~200 m +three hours~3 h +two hundred kilometers per hour~200 
km/h +minus sixty six kilograms~-66 kg +eight point five megawatts~8.5 mW +eight point five meters~8.5 m +eight point five two percent~8.52 % +eight point four four percent~8.44 % +five degrees celsius~5 °C +seventy two degrees fahrenheit~72 °F +two hundred seventy three kelvin~273 K +eighteen feet~18 ft +eighteen point five kilometers~18.5 km +eight hundred fifty meters~850 m +eight hundred kilograms~800 kg diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..aa9107892 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,41 @@ +दस रुपये~₹१० +सौ रुपये~₹१०० +हज़ार रुपये~₹१००० +दो सौ रुपये~₹२०० +पाँच सौ रुपये~₹५०० +दो हज़ार रुपये~₹२००० +एक लाख रुपये~₹१००००० +दस रुपये और पचास पैसे~₹१०.५० +बीस डॉलर~$२० +पचास यूरो~€५० +बहत्तर बिटकॉइन~₿७२ +ढाई सौ रुपये~₹२५० +साढ़े तीन सौ रुपये~₹३५० +सवा दो सौ रुपये~₹२२५ +पौने चार सौ रुपये~₹३७५ +ढाई करोड़ रुपये~₹२५०००००० +साढ़े तीन लाख रुपये~₹३५०००० +सवा दो लाख रुपये~₹२२५००० +पचास हज़ार रुपये~₹५०००० +two dollars~$2 +one cent~$0.01 +four united states dollars and sixty nine cents~$4.69 +seventy five dollars sixty three~$75.63 +twenty nine dollars fifty cents~$29.50 +eleven dollars and fifty one cents~$11.51 +nine hundred ninety three dollars and ninety two cents~$993.92 +four hundred sixty billion won~₩460 billion +thirty billion yen~¥30 billion +two point five billion dollars~$2.5 billion +forty five billion dollars~$45 billion +fifty million dollars~$50 million +one dollar~$1 +fifteen thousand dollars~$15000 +twenty dollar~$20 +eighteen dollars~$18 +fifteen hundred dollars~$1500 +one hundred रुपये~100 रुपये +fifty रुपये~50 रुपये +सौ डॉलर~$१०० +twenty dollars और fifty cents~$20 और $0.50 +दस rupees~१० rupees diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt 
b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..94910d49c --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,26 @@ +पहला~पहला +दूसरा~दूसरा +तीसरा~तीसरा +चौथा~चौथा +दसवां~१०वां +ग्यारहवां~११वां +बीसवां~२०वां +तेईसवां~२३वां +पचासवां~५०वां +नब्बेवाँ~९०वाँ +दसवीं~१०वीं +बीसवीं~२०वीं +first~1st +second~2nd +third~3rd +fourth~4th +eleventh~11th +twelfth~12th +thirteenth~13th +twenty first~21st +twenty third~23rd +one hundredth~100th +one hundred eleventh~111th +one thousandth~1000th +forty second~42nd +seventy first~71st diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..e318fba4e --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,39 @@ +एक एक एक एक एक एक~११११११ +पाँच शून्य शून्य शून्य एक दो~५०००१२ +एक दो तीन चार पाँच छह~१२३४५६ +चार शून्य शून्य शून्य एक शून्य~४०००१० +सात पाँच शून्य शून्य शून्य दो~७५०००२ +आठ आठ शून्य नौ नौ शून्य~८८०९९० +नौ आठ सात छह पाँच चार तीन दो एक शून्य~९८७६५४३२१० +सात शून्य एक दो तीन चार पाँच छह सात आठ~७०१२३४५६७८ +आठ आठ आठ सात सात सात छह छह छह छह~८८८७७७६६६६ +छह दो नौ शून्य एक पाँच सात तीन चार आठ~६२९०१५७३४८ +नौ नौ आठ आठ सात सात छह छह पाँच पाँच~९९८८७७६६५५ +प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +प्लस नौ एक सात शून्य एक दो तीन चार पाँच छह सात आठ~+९१ ७०१२३४५६७८ +प्लस नौ एक आठ आठ आठ सात सात सात छह छह छह छह~+९१ ८८८७७७६६६६ +प्लस नौ एक एक एक एक एक एक एक एक एक एक एक~+९१ ११११११११११ +शून्य दो शून्य दो चार तीन सात एक पाँच चार दो~०२०२४३७१५४२ +शून्य एक एक दो छह एक दो तीन चार पाँच छह~०११२६१२३४५६ +चार चार दो दो आठ आठ छह छह चार चार~४४२२८८६६४४ +शून्य आठ शून्य चार एक दो तीन चार पाँच छह सात~०८०४१२३४५६७ +दो दो छह छह पांच चार तीन दो एक शून्य~२२६६५४३२१० 
+पाँच शून्य शून्य नौ~५००९ +एक शून्य दो शून्य~१०२० +one two three one two three five six seven eight~123-123-5678 +plus nine one one two three one two three five six seven eight~+91 123-123-5678 +plus forty four one two three one two three five six seven eight~+44 123-123-5678 +o two three one two three five six seven eight~023-123-5678 +oh two three one two three five six seven eight~023-123-5678 +double oh three one two three five six seven eight~003-123-5678 +two two five dot double five dot o dot four o~225.55.0.40 +two two five dot double five dot o dot forty five~225.55.0.45 +ssn is seven double nine one two three double one three~ssn is 799-12-3113 +seven nine nine~799 +a b nine~ab9 +a b c~a b c +five w k r a three one~5wkra31 +नंबर nine eight seven six five four three two one zero~नंबर 987-654-3210 +मोबाइल one two three four five six seven eight nine zero~मोबाइल 123-456-7890 +मोबाइल nine eight seven six five four three two one zero~मोबाइल 987-654-3210 +phone एक एक एक एक एक एक एक एक एक एक~phone ११११११११११ diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..134b699b4 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,29 @@ +एक बजे~१:०० +दो बजे~२:०० +दस बजे~१०:०० +बारह बजे~१२:०० +एक बजके सात मिनट~१:०७ +चार बजे चवालीस मिनट~४:४४ +दस बजके तीस मिनट~१०:३० +तीन बजके पन्द्रह मिनट~३:१५ +एक बजके दस मिनट दो सेकंड~१:१०:०२ +साढ़े तीन बजे~३:३० +सवा चार बजे~४:१५ +पौने पाँच बजे~४:४५ +डेढ़ बजे~१:३० +ढाई बजे~२:३० +eight oclock g m t~08:00 gmt +seven a m e s t~07:00 a.m. est +two p m~02:00 p.m. +two thirty~02:30 +three o'clock~03:00 +quarter past one~01:15 +half past three~03:30 +eight fifty one~08:51 +eight forty~08:40 +eleven fifty five p m~11:55 p.m. +eleven forty five a m~11:45 a.m. +eleven forty six a m~11:46 a.m. 
+quarter to twelve~11:45 +set alarm at ten to eleven pm~set alarm at 10:50 p.m. +one min to one am~12:59 a.m. diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..2b22a8a78 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,17 @@ +डॉक्टर~डॉ. +श्रीमती~स्मि. +श्री~श्री. +श्रीमान~श्री. +कुमारी~कु. +मास्टर~मा. +पाव~१/४ +doctor dao~dr. dao +misses smith~mrs. smith +mister dao~mr. dao +saint george~st. george +i like for example ice cream~i like e.g. ice cream +s and p five hundred~S&P 500 +seven eleven stores~7-eleven stores +r t x~RTX +nvidia a one hundred~Nvidia A100 +c u d n n~cuDNN diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..14977faec --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,19 @@ +~ +yahoo!~yahoo! +twenty!~20 ! +x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aachen's~aachen's +aadri~aadri +aahar~aahar +aahh~aahh +~ +, one~, one +, one , two , three , four~, one , two , three , four +e s three~es3 +नमस्ते~नमस्ते +भारत~भारत +दुनिया~दुनिया diff --git a/tests/nemo_text_processing/hi_en/test_address.py b/tests/nemo_text_processing/hi_en/test_address.py new file mode 100644 index 000000000..4f7dc3c51 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_address.py @@ -0,0 +1,38 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestAddress: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_address.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_address.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi_en/test_cardinal.py b/tests/nemo_text_processing/hi_en/test_cardinal.py new file mode 100644 index 000000000..05180a19a --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_cardinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_date.py b/tests/nemo_text_processing/hi_en/test_date.py new file mode 100644 index 
000000000..44a1203e4 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_date.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_date_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def 
test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_decimal.py b/tests/nemo_text_processing/hi_en/test_decimal.py new file mode 100644 index 000000000..51e42e7c2 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_decimal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_decimal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_electronic.py b/tests/nemo_text_processing/hi_en/test_electronic.py new file mode 100644 index 000000000..e57c846de --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_electronic.py @@ -0,0 +1,41 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestElectronic: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_electronic.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_electronic_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_fraction.py b/tests/nemo_text_processing/hi_en/test_fraction.py new file mode 100644 index 000000000..db83bd32a --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_fraction.py @@ -0,0 +1,38 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFraction: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_measure.py b/tests/nemo_text_processing/hi_en/test_measure.py new file mode 100644 index 000000000..7298e5187 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_measure.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_measure_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_money.py b/tests/nemo_text_processing/hi_en/test_money.py new file mode 100644 index 
000000000..f61aea48c --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_money.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMoney: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_money_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def 
test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_ordinal.py b/tests/nemo_text_processing/hi_en/test_ordinal.py new file mode 100644 index 000000000..4af306f52 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_ordinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_ordinal_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..0a805bbed --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,102 @@ +#! /bin/sh +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNOrdinal() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testITNTime() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + +testITNMoney() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + 
+testITNWhitelist() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testITNTelephone() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + +testITNAddress() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_address.txt + runtest $input +} + +testITNFraction() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_fraction.txt + runtest $input +} + +testITNElectronic() { + input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_electronic.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 diff --git a/tests/nemo_text_processing/hi_en/test_telephone.py b/tests/nemo_text_processing/hi_en/test_telephone.py new file mode 100644 index 000000000..4c3d23640 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_telephone.py @@ -0,0 +1,41 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_telephone_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_time.py b/tests/nemo_text_processing/hi_en/test_time.py new file mode 100644 index 000000000..2f872232f --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_time.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_time_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_whitelist.py b/tests/nemo_text_processing/hi_en/test_whitelist.py new file mode 100644 index 000000000..fc4b17796 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_whitelist.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_whitelist_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi_en/test_word.py b/tests/nemo_text_processing/hi_en/test_word.py new file mode 100644 index 
000000000..345d4bf82 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_word.py @@ -0,0 +1,48 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False) + inverse_normalizer_cased = InverseNormalizer( + lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased" + ) + + @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm_hi(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_word_cased.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False) + assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def 
test_denorm_hi_native(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index d6dfebdde..03705f2b6 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -103,6 +103,7 @@ def parse_args(): 'es_en', 'he', 'hi', + 'hi_en', 'hy', 'mr', 'ja', @@ -139,7 +140,7 @@ def parse_args(): if __name__ == '__main__': args = parse_args() - if args.language in ['ru', 'es_en', 'mr'] and args.grammars == 'tn_grammars': + if args.language in ['ru', 'es_en', 'hi_en', 'mr'] and args.grammars == 'tn_grammars': raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.') TNPostProcessingFst = None ITNPostProcessingFst = None @@ -289,6 +290,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'hi_en': + from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'mr': from nemo_text_processing.inverse_text_normalization.mr.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, From c8ad73ea10b2d225a967a339cb9900603ed5d490 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 12 Jun 2026 06:18:29 -0400 Subject: [PATCH 5/6] ar money bugfix and sh tests (#438) * ar money bugfix and sh tests Signed-off-by: Mariana Graterol Fuenmayor * address review Signed-off-by: Mariana Graterol Fuenmayor --------- Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- 
.../text_normalization/ar/taggers/money.py | 29 ++++---- .../ar/verbalizers/money.py | 10 +-- .../ar/test_sparrowhawk_normalization.sh | 71 +++++++++++++++++++ .../pynini_export.py | 1 + 5 files changed, 91 insertions(+), 22 deletions(-) create mode 100755 tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh diff --git a/Jenkinsfile b/Jenkinsfile index 4ac66d02f..985363986 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -10,7 +10,7 @@ pipeline { disableConcurrentBuilds(abortPrevious: true) } environment { - AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0' + AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-11-26-0' DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0' EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3' ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0' diff --git a/nemo_text_processing/text_normalization/ar/taggers/money.py b/nemo_text_processing/text_normalization/ar/taggers/money.py index 925fa348e..b809354e4 100644 --- a/nemo_text_processing/text_normalization/ar/taggers/money.py +++ b/nemo_text_processing/text_normalization/ar/taggers/money.py @@ -80,14 +80,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"") ) - graph_integer_only = graph_maj_singular + insert_space + graph_integer_one - graph_integer_only |= graph_maj_plural + insert_space + graph_integer + currency_first = pynutil.insert(' morphosyntactic_features: "currency_first"') + # Currency-first tagging for exactly one major unit (e.g. $1 -> دولار واحد). 
+ graph_integer_one_unit = graph_maj_singular + insert_space + graph_integer_one + currency_first # For local currency "9د.ك" graph_integer_only_ar = graph_integer + insert_space + graph_ar_cur - # graph_decimal_ar = graph_decimal_final + insert_space + graph_ar_cur - graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_integer_only_ar + graph = (graph_integer_one_unit + optional_delete_fractional_zeros) | graph_integer_only_ar # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10 @@ -112,9 +112,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): preserve_order = pynutil.insert(" preserve_order: true") integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural - integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular - # non zero integer part - integer_plus_maj = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj + integer_plus_maj_with_one = integer_plus_maj | ( + graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular + ) + # Amount == 1 without fractional part uses graph_integer_one_unit / graph_one_prefix. 
+ integer_plus_maj_no_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj + integer_plus_maj_with_minor = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj_with_one graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "") graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"") @@ -141,22 +144,16 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_fractional_up_to_ten + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural ) - graph_with_no_minor_curr = integer_plus_maj - graph_with_no_minor_curr |= pynutil.add_weight( - integer_plus_maj, - weight=0.0001, - ) - - graph_with_no_minor_curr = pynutil.delete(curr_symbol) + graph_with_no_minor_curr + preserve_order + graph_with_no_minor_curr = pynutil.delete(curr_symbol) + integer_plus_maj_no_minor + preserve_order graph_with_no_minor = ( graph_with_no_minor_curr if graph_with_no_minor is None else pynini.union(graph_with_no_minor, graph_with_no_minor_curr) ) - decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min + decimal_graph_with_minor_curr = integer_plus_maj_with_minor + pynini.cross(".", " ") + fractional_plus_min decimal_graph_with_minor_curr |= pynutil.add_weight( - integer_plus_maj + integer_plus_maj_with_minor + pynini.cross(".", " ") + pynutil.insert("fractional_part: \"") + two_digits_fractional_part @ cardinal_graph diff --git a/nemo_text_processing/text_normalization/ar/verbalizers/money.py b/nemo_text_processing/text_normalization/ar/verbalizers/money.py index 46da10742..9f5041b13 100644 --- a/nemo_text_processing/text_normalization/ar/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/ar/verbalizers/money.py @@ -28,6 +28,7 @@ class MoneyFst(GraphFst): Finite state transducer for verbalizing money, e.g. 
money { integer_part: "تسعة" currency_maj: "يورو" preserve_order: true} -> "تسعة يورو" money { integer_part: "تسعة" currency_maj: "دولار" preserve_order: true} -> "تسعة دولار" + money { currency_maj: "دولار" integer_part: "واحد" morphosyntactic_features: "currency_first"} -> "دولار واحد" money { integer_part: "خمسة" currency_maj: "دينار كويتي"} -> "خمسة دينار كويتي" Args: @@ -49,9 +50,10 @@ def __init__(self, deterministic: bool = True): integer_part = pynutil.delete("integer_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") add_and = pynutil.insert(" و") + morph_currency_first = pynutil.delete(' morphosyntactic_features: "currency_first"') - # *** currency_maj - graph_integer = maj + keep_space + integer_part + # currency_maj before integer_part; disambiguated via morphosyntactic_features for Sparrowhawk. + graph_currency_first = maj + keep_space + integer_part + delete_space + morph_currency_first # *** currency_maj + (***) (و) *** current_min graph_integer_with_minor = ( @@ -65,12 +67,10 @@ def __init__(self, deterministic: bool = True): + pynini.closure(keep_space + min, 0, 1) + delete_preserve_order ) - # this graph fix word order from dollar three (دولار تسعة)--> three dollar (تسعة دولار) graph_integer_no_minor = integer_part + keep_space + maj + delete_space + delete_preserve_order - # *** current_min graph_minor = fractional_part + keep_space + delete_space + min + delete_preserve_order - graph = graph_integer | graph_integer_with_minor | graph_minor | graph_integer_no_minor + graph = graph_currency_first | graph_integer_with_minor | graph_minor | graph_integer_no_minor delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh new file mode 100755 index 000000000..6998a6fbc --- /dev/null +++ b/tests/nemo_text_processing/ar/test_sparrowhawk_normalization.sh @@ -0,0 +1,71 @@ +#! 
/bin/sh +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ar"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + while IFS= read -r testcase; do + IFS='~' read -r written spoken <<< "$testcase" + + escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') + denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +# For test files stored as expected~input (spoken~written). +runtest_swapped () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + while IFS= read -r testcase; do + IFS='~' read -r spoken written <<< "$testcase" + + escaped_written=$(printf '%s' "$written" | sed 's/\\/\\\\/g') + denorm_pred=$(echo "$escaped_written" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + +testTNCardinal() { + input=$TEST_DIR/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNDecimal() { + input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +testTNFraction() { + input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt + runtest_swapped $input +} + +testTNMeasure() { + input=$TEST_DIR/data_text_normalization/test_cases_measure.txt + runtest_swapped $input +} + +testTNMoney() { + input=$TEST_DIR/data_text_normalization/test_cases_money.txt + runtest $input +} + +# Remove 
all command-line arguments +shift $# + +# Load shUnit2 +. /workspace/shunit2/shunit2 diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 03705f2b6..73a4fc138 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -278,6 +278,7 @@ def parse_args(): from nemo_text_processing.text_normalization.ar.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, ) + from nemo_text_processing.text_normalization.ar.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst elif args.language == 'it': from nemo_text_processing.text_normalization.it.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, From 1f1263579fe57ba7ed783cad3dddee710fcc5064 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Fri, 12 Jun 2026 06:19:38 -0400 Subject: [PATCH 6/6] change normalization of alphanum terms (#439) * change normalization of alphanum terms Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address review Signed-off-by: Mariana Graterol Fuenmayor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mariana Graterol Fuenmayor Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 2 +- .../text_normalization/en/taggers/serial.py | 157 ++++++++++++------ .../test_cases_ordinal.txt | 2 +- .../test_cases_serial.txt | 2 + 4 files changed, 109 insertions(+), 54 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 985363986..38e05bb00 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -12,7 +12,7 @@ pipeline { environment { AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-11-26-0' DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0' - 
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3' + EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-11-26-1' ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0' ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0' HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-4' diff --git a/nemo_text_processing/text_normalization/en/taggers/serial.py b/nemo_text_processing/text_normalization/en/taggers/serial.py index f650c8ff3..4a5d6bb9d 100644 --- a/nemo_text_processing/text_normalization/en/taggers/serial.py +++ b/nemo_text_processing/text_normalization/en/taggers/serial.py @@ -18,6 +18,8 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.en.graph_utils import ( + MIN_NEG_WEIGHT, + MIN_POS_WEIGHT, NEMO_ALPHA, NEMO_DIGIT, NEMO_NOT_SPACE, @@ -28,16 +30,65 @@ from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels +def _leading_zero_graph(cardinal: GraphFst) -> "pynini.FstLike": + return pynini.compose(pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph).optimize() + + +def _build_serial_graph( + num_graph: "pynini.FstLike", + delimiter: "pynini.FstLike", + alphas: "pynini.FstLike", + ordinal: GraphFst, +) -> "pynini.FstLike": + letter_num = alphas + delimiter + num_graph + num_letter = pynini.closure(num_graph + delimiter, 1) + alphas + next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph)) + next_alpha_or_num |= pynini.closure( + delimiter + + num_graph + + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize() + + alphas + ) + + serial_graph = letter_num + next_alpha_or_num + serial_graph |= num_letter + next_alpha_or_num + serial_graph |= num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph) + + symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))] + symbols = pynini.union(*symbols) + 
serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph) + + serial_graph = pynini.compose( + pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph + ).optimize() + + serial_graph = pynutil.add_weight(serial_graph, MIN_POS_WEIGHT) + serial_graph |= ( + pynini.closure(NEMO_NOT_SPACE, 1) + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize() + ) + + serial_graph = ( + pynini.closure((serial_graph | num_graph | alphas) + delimiter) + + serial_graph + + pynini.closure(delimiter + (serial_graph | num_graph | alphas)) + ) + return serial_graph.optimize() + + class SerialFst(GraphFst): """ - This class is a composite class of two other class instances + Finite state transducer for classifying serial numbers without conventional delimiters. + + Digit normalization within letter-digit tokens follows: + 1. 1-2 digits, or single digits followed by zeros -> cardinal + 2. 3 digits not ending in 00, or 4+ digits -> single-digit reading + 3. Digit-only tokens separated by ``/`` -> cardinal per segment (5+ digits stay single-digit) Args: - time: composed tagger and verbalizer - date: composed tagger and verbalizer - cardinal: tagger + cardinal: cardinal tagger + ordinal: ordinal tagger (used to exclude ordinal readings) deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + for False multiple transduction are generated (used for audio-based normalization) lm: whether to use for hybrid LM """ @@ -48,31 +99,56 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = Finite state transducer for classifying serial (handles only cases without delimiters, values with delimiters are handled by default). 
The serial is a combination of digits, letters and dashes, e.g.: - c325b -> tokens { cardinal { integer: "c three two five b" } } + "H800" -> tokens { name: "H eight hundred" } + "a320b" -> tokens { name: "a three two zero b" } + "12/345/67890" -> tokens { name: "twelve/three hundred forty five/six seven eight nine zero" } + """ if deterministic: - num_graph = pynini.compose(NEMO_DIGIT ** (6, ...), cardinal.single_digits_graph).optimize() - num_graph |= pynini.compose(NEMO_DIGIT ** (1, 5), cardinal.graph).optimize() - # to handle numbers starting with zero - num_graph |= pynini.compose( - pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph + num_graph_pure = ( + pynini.compose(NEMO_DIGIT ** (1, 3), cardinal.graph) + | pynini.compose(NEMO_DIGIT ** (4, ...), cardinal.single_digits_graph) + | _leading_zero_graph(cardinal) + ).optimize() + + num_graph_alnum = ( + pynini.compose(NEMO_DIGIT, cardinal.graph) + | pynini.compose(NEMO_DIGIT**2, cardinal.graph) + | pynutil.add_weight( + pynini.compose(NEMO_DIGIT + pynini.closure("0", 1), cardinal.graph), MIN_NEG_WEIGHT + ) + | pynini.compose( + pynini.difference(NEMO_DIGIT**3, NEMO_DIGIT + "00"), cardinal.single_digits_graph + ) + | pynini.compose(NEMO_DIGIT ** (4, ...), cardinal.single_digits_graph) + | _leading_zero_graph(cardinal) + ).optimize() + + num_graph_slash = ( + pynini.compose(NEMO_DIGIT ** (1, 4), cardinal.graph) + | pynini.compose(NEMO_DIGIT ** (5, ...), cardinal.single_digits_graph) + | _leading_zero_graph(cardinal) ).optimize() + else: - num_graph = cardinal.final_graph + num_graph_pure = cardinal.final_graph + num_graph_alnum = cardinal.final_graph + num_graph_slash = cardinal.final_graph # TODO: "#" doesn't work from the file symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross( "#", "hash" ) - num_graph |= symbols_graph + num_graph_pure |= symbols_graph + num_graph_alnum |= symbols_graph if not self.deterministic and
not lm: - num_graph |= cardinal.single_digits_graph - num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA) - # also allow double digits to be pronounced as integer in serial number - num_graph |= pynutil.add_weight( - NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001 + num_graph_pure |= cardinal.single_digits_graph + num_graph_pure |= pynini.compose(num_graph_pure, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA) + num_graph_pure |= pynutil.add_weight( + NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=MIN_POS_WEIGHT ) + num_graph_alnum = num_graph_pure # add space between letter and digit/symbol symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))] @@ -90,44 +166,21 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = delimiter |= pynini.cross("-", " dash ") | pynini.cross("/", " slash ") alphas = pynini.closure(NEMO_ALPHA, 1) - letter_num = alphas + delimiter + num_graph - num_letter = pynini.closure(num_graph + delimiter, 1) + alphas - next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph)) - next_alpha_or_num |= pynini.closure( - delimiter - + num_graph - + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize() - + alphas - ) - - serial_graph = letter_num + next_alpha_or_num - serial_graph |= num_letter + next_alpha_or_num - # numbers only with 2+ delimiters - serial_graph |= ( - num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph) - ) - # 2+ symbols - serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph) - - # exclude ordinal numbers from serial options - serial_graph = pynini.compose( - pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph - ).optimize() - serial_graph = pynutil.add_weight(serial_graph, 0.0001) - 
serial_graph |= ( - pynini.closure(NEMO_NOT_SPACE, 1) - + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize() - ) + serial_graph = _build_serial_graph(num_graph_pure, delimiter, alphas, ordinal) + serial_graph_alnum = _build_serial_graph(num_graph_alnum, delimiter, alphas, ordinal) - # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values - serial_graph = ( - pynini.closure((serial_graph | num_graph | alphas) + delimiter) - + serial_graph - + pynini.closure(delimiter + (serial_graph | num_graph | alphas)) + # Rule 3: tokens that contain only digits and slashes (e.g. 31/31/100, 123/261788/2021). + slash_digit_token = ( + pynini.closure(NEMO_DIGIT, 1) + pynini.accep("/") + pynini.closure(NEMO_DIGIT | pynini.accep("/"), 0) ) + slash_serial = pynini.compose( + slash_digit_token, + pynini.closure(num_graph_slash + pynini.accep("/"), 1) + num_graph_slash, + ).optimize() + serial_graph |= pynutil.add_weight(slash_serial, MIN_NEG_WEIGHT) - serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize() + serial_graph |= pynini.compose(graph_with_space, serial_graph_alnum.optimize()).optimize() serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize() # this is not to verbolize "/" as "slash" in cases like "import/export" diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_ordinal.txt index 2e1b5ec7e..d4f073525 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_ordinal.txt @@ -24,4 +24,4 @@ 21th~twenty one th 121st~one hundred twenty first 111th~one hundred eleventh -111st~one hundred eleven st \ No newline at end of file +111st~one one one st \ No newline at end of file diff --git 
a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt index f0a6e0a3f..f142ceb7e 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_serial.txt @@ -29,3 +29,5 @@ a 4-kilogram bag~a four-kilogram bag 100-car~one hundred-car 123/261788/2021~one hundred twenty three/two six one seven eight eight/two thousand twenty one 2*8~two asterisk eight +my pnr is t2000~my pnr is t two thousand +your otp is ab9453~your otp is ab nine four five three \ No newline at end of file