diff --git a/nemo_text_processing/text_normalization/fr/data/dates/__init__.py b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv new file mode 100644 index 000000000..6127bea93 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv @@ -0,0 +1,8 @@ +20s twenties +30s thirties +40s forties +50s fifties +60s sixties +70s seventies +80s eighties +90s nineties \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/data/dates/months.tsv b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv new file mode 100644 index 000000000..98a4e7d5d --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv @@ -0,0 +1,12 @@ +1 janvier +2 février +3 mars +4 avril +5 mai +6 juin +7 juillet +8 août +9 septembre +10 octobre +11 novembre +12 décembre \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr/taggers/date.py b/nemo_text_processing/text_normalization/fr/taggers/date.py new file mode 100644 index 000000000..4c1f0afc4 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/fr/taggers/date.py @@ -0,0 +1,150 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.fr.utils import get_abs_path + +# TODO: add articles? 'le...' + +month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv")) +eras = pynini.string_file(get_abs_path("data/dates/eras.tsv")) +delete_leading_zero = ( + pynutil.delete("0") | (NEMO_DIGIT - "0") +) + NEMO_DIGIT # reminder, NEMO_DIGIT = filter on digits + + +class DateFst(GraphFst): + ''' Finite state transducer for classifying dates, e.g.: + '02.03.2003' -> date {day: 'deux' month: 'mars' year: 'deux mille trois' preserve_order: true} + ''' + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="dates", kind="classify") + + cardinal_graph = cardinal.all_nums_no_tokens + + # 'le' -> 'le', 'les' -> 'les' + le_determiner = pynini.accep("le ") | pynini.accep("les ") + self.optional_le = pynini.closure(le_determiner, 0, 1) + + # '01' -> 'un' + optional_leading_zero = delete_leading_zero | NEMO_DIGIT + valid_day_number = pynini.union(*[str(x) for x in range(1, 32)]) + premier = pynini.string_map([("1", "premier")]) + day_number_to_word = premier | cardinal_graph + + digit_to_day = self.optional_le + optional_leading_zero @ valid_day_number @ day_number_to_word + self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"") + + # '03' -> 'mars' + normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)]) + number_to_month = month_numbers.optimize() + month_graph = normalize_month_number @ number_to_month + self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"") + + # 2025 -> deux mille vingt cinq + accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3) + digits_to_year = accept_year_digits @ 
cardinal_graph + self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"") + + # Putting it all together + self.fst = pynini.accep("") + + for separator in ["/", ".", "-"]: + self.fst |= ( + pynutil.insert("date { ") + + self.day_graph + + pynutil.delete(separator) + + pynutil.insert(" ") + + self.month_graph + + pynini.closure(pynutil.delete(separator) + pynutil.insert(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + # Accepts "janvier", "février", etc + month_name_graph = pynutil.insert("month: \"") + month_numbers.project("output") + pynutil.insert("\"") + + self.fst |= ( + pynutil.insert("date { ") + + self.day_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + # Accepts "70s", "80s", etc + self.fst |= pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }") + + # Accepts date ranges, "17-18-19 juin" -> date { day: "17" day: "18" day: "19"} + for separator in ["-", "/"]: + day_range_graph = ( + pynutil.insert("day: \"") + + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1) + + digit_to_day + + pynutil.insert("\"") + ) + + self.fst |= ( + pynutil.insert("date { ") + + day_range_graph + + pynini.accep(" ") + + month_name_graph + + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1) + + pynutil.insert(" preserve_order: true }") + ) + + self.fst = self.fst.optimize() + + +def apply_fst(text, fst): + try: + output = pynini.shortestpath(text @ fst).string() + print(f"'{text}' --> '{output}'") + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") + + +if __name__ == "__main__": + from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst + + fst = DateFst(CardinalFst()) + + print('DETERMINER') + apply_fst("le ", fst.optional_le) + apply_fst("", fst.optional_le) + + 
print("\nDAY GRAPH") + apply_fst("01", fst.day_graph) + apply_fst("02", fst.day_graph) + apply_fst("3", fst.day_graph) + apply_fst("12", fst.day_graph) + apply_fst("le 01", fst.day_graph) + apply_fst("le 12", fst.day_graph) + + print("\nMONTH GRAPH") + apply_fst("1", fst.month_graph) + apply_fst("3", fst.month_graph) + apply_fst("06", fst.month_graph) + + print("\nYEAR") + apply_fst("2025", fst.year_graph) + + print("\nDATE") + apply_fst("02.03.2003", fst.fst) + apply_fst("02/03/2003", fst.fst) + apply_fst("02-03-2003", fst.fst) + apply_fst("le 02.03.2003", fst.fst) + + apply_fst("02.03", fst.fst) + apply_fst("17 janvier", fst.fst) + apply_fst("10 mars 2023", fst.fst) + apply_fst("le 10 mars 2023", fst.fst) + + print("\nERAS") + apply_fst("80s", fst.fst) + + print("\nDATE RANGES") + apply_fst( + "les 17/18/19 juin", fst.fst + ) # returns: date { day: "les dix-sept" day: "dix-huit" day: "dix-neuf" month: "juin" preserve_order: true } diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py index de9a0b047..cacc94bcf 100644 --- a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py @@ -26,6 +26,7 @@ ) from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.taggers.date import DateFst from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst @@ -86,8 +87,12 @@ def __init__( whitelist_graph = self.whitelist.fst punct_graph = PunctuationFst(deterministic=deterministic).fst + self.date = 
DateFst(self.cardinal, deterministic=deterministic) + date_graph = self.date.fst + classify = ( pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(fraction_graph, 1.09) | pynutil.add_weight(ordinal_graph, 1.1) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/date.py b/nemo_text_processing/text_normalization/fr/verbalizers/date.py new file mode 100644 index 000000000..332d80bd6 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr/verbalizers/date.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SPACE, + GraphFst, + delete_preserve_order, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. 
+ date {day: "deux" month: "mars" year: "deux mille trois" preserve_order: true} -> deux mars deux mille trois + + Args: + ordinal: OrdinalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + day = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + month = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + decade = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + graph_dmy = day + NEMO_SPACE + month + pynini.closure(NEMO_SPACE + year, 0, 1) + delete_preserve_order + graph_my = month + NEMO_SPACE + year + delete_preserve_order + graph_decade = decade + delete_preserve_order + + self.graph = graph_dmy | graph_my | graph_decade + + delete_tokens = self.delete_tokens(self.graph) + self.fst = delete_tokens.optimize() + + +def apply_fst(text, fst): + try: + output = pynini.shortestpath(text @ fst).string() + print(f"'{text}' --> '{output}'") + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") + + +if __name__ == "__main__": + fst = DateFst() + + # tagger output for "eighties" + apply_fst('date { year: "eighties" preserve_order: true }', fst.fst) diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py index 02510ea5f..3ea0117af 100644 --- a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py @@ -14,6 +14,7 @@ from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from 
nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst @@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True): fraction = FractionFst(ordinal=ordinal, deterministic=deterministic) fraction_graph = fraction.fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst + date = DateFst(deterministic=deterministic) + date_graph = date.fst - graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph + graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph self.fst = graph diff --git a/nemo_text_processing/text_normalization/fr_tutorial/__init__.py b/nemo_text_processing/text_normalization/fr_tutorial/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/fr_tutorial/data/__init__.py b/nemo_text_processing/text_normalization/fr_tutorial/data/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/fr_tutorial/data/numbers/digits.tsv b/nemo_text_processing/text_normalization/fr_tutorial/data/numbers/digits.tsv new file mode 100644 index 000000000..a1e73f012 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/data/numbers/digits.tsv @@ -0,0 +1,11 @@ +zéro 0 +un 1 +une 1 +deux 2 +trois 3 +quatre 4 +cinq 5 +six 6 +sept 7 +huit 8 +neuf 9 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr_tutorial/data/whitelist.tsv b/nemo_text_processing/text_normalization/fr_tutorial/data/whitelist.tsv new file mode 100644 index 000000000..dc563bdab --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/data/whitelist.tsv @@ -0,0 +1,13 @@ +Mᵐᵉ madame +Mᵐᵉˢ mesdames +Mˡˡᵉ mademoiselle +Mˡˡᵉˢ mademoiselles +Dʳ docteur +Dʳˢ docteurs +Dʳᵉ docteure +Dʳᵉˢ docteures +apr. J.-C. après jésus-christ +av. J.-C. avant Jésus-Christ +le hon. l’honorable +le très hon. 
le très honorable +% pour cent \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/fr_tutorial/taggers/my_test_script.py b/nemo_text_processing/text_normalization/fr_tutorial/taggers/my_test_script.py new file mode 100644 index 000000000..0c017e70a --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/taggers/my_test_script.py @@ -0,0 +1,49 @@ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.fr.utils import get_abs_path + + +def apply_fst(text, fst): + """ Given a string input, returns the output string + produced by traversing the path with lowest weight. + If no valid path accepts input string, returns an + error. + """ + try: + print(pynini.shortestpath(text @ fst).string()) + except pynini.FstOpError: + print(f"Error: No valid output with given input: '{text}'") + + +zero = pynini.string_map([("zéro", "0")]) # French only pronounces zeroes as stand alone +digits_map = pynini.string_map( + [ # pynini function that creates explicit input-output mappings for a WFST + ("un", "1"), + ("une", "1"), + ("deux", "2"), + ("trois", "3"), + ("quatre", "4"), + ("cinq", "5"), + ("six", "6"), + ("sept", "7"), + ("huit", "8"), + ("neuf", "9"), + ] +) + +digits = pynini.string_file("data/numbers/digits.tsv") + +teens = pynini.string_map([("onze", "11"), ("douze", "12"), ("treize", "13"), ("quatorze", "14"), ("quinze", "15"),]) + +tens = pynini.string_map([("dix", "1")]) +delete_hyphen = pynini.closure( + pynutil.delete("-"), 0, 1 +) # Applies a closure from 0-1 of operation. 
Equivalent to regex /?/ + +graph_tens = tens + delete_hyphen + digits +graph_tens_and_teens = graph_tens | teens + +graph_digits = digits | pynutil.insert("0") + +apply_fst("un", graph_tens_and_teens) diff --git a/nemo_text_processing/text_normalization/fr_tutorial/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr_tutorial/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..de9a0b047 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/taggers/tokenize_and_classify.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst +from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.fr.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.fr.taggers.word import WordFst +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finate State aRchive (FAR) File. + More details to deployment at NeMo-text-processing/tools/text_processing_deployment. + Args: + input_case: accepting either "lower_cased" or "cased" input. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + input_case: str, + deterministic: bool = False, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + whitelist_file = os.path.basename(whitelist) if whitelist else "" + far_file = os.path.join( + cache_dir, f"_{input_case}_fr_tn_{deterministic}_deterministic{whitelist_file}.far", + ) + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logger.info(f"Creating ClassifyFst grammars. This might take some time...") + + self.cardinal = CardinalFst(deterministic=deterministic) + cardinal_graph = self.cardinal.fst + + self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic) + ordinal_graph = self.ordinal.fst + + self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic) + decimal_graph = self.decimal.fst + + self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,) + fraction_graph = self.fraction.fst + word_graph = WordFst(deterministic=deterministic).fst + self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) + whitelist_graph = self.whitelist.fst + punct_graph = PunctuationFst(deterministic=deterministic).fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.09) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(word_graph, 200) + ) + punct = 
pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") + punct = pynini.closure( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + | (pynutil.insert(" ") + punct), + 1, + ) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure((delete_extra_space).ques + token_plus_punct) + graph = delete_space + graph + delete_space + graph |= punct + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logger.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/text_normalization/fr_tutorial/utils.py b/nemo_text_processing/text_normalization/fr_tutorial/utils.py new file mode 100644 index 000000000..7523e5762 --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path): + """ + loads relative path file as dictionary + + Args: + abs_path: absolute path + + Returns dictionary of mappings + """ + label_tsv = open(abs_path) + labels = list(csv.reader(label_tsv, delimiter="\t")) + label_tsv.close() + return labels diff --git a/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize.py new file mode 100644 index 000000000..02510ea5f --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst +from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + More details to deployment at NeMo-text-processing/tools/text_processing_deployment. + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) + cardinal = CardinalFst(deterministic=deterministic) + cardinal_graph = cardinal.fst + ordinal = OrdinalFst(deterministic=deterministic) + ordinal_graph = ordinal.fst + decimal = DecimalFst(deterministic=deterministic) + decimal_graph = decimal.fst + fraction = FractionFst(ordinal=ordinal, deterministic=deterministic) + fraction_graph = fraction.fst + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph + self.fst = graph diff --git a/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize_final.py new file mode 100644 index 000000000..0313f7f5b --- /dev/null +++ b/nemo_text_processing/text_normalization/fr_tutorial/verbalizers/verbalize_final.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.fr.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.utils.logging import logger + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"fr_tn_{deterministic}_deterministic_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.') + else: + + verbalize = VerbalizeFst(deterministic=deterministic).fst + word = WordFst(deterministic=deterministic).fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + + self.fst = graph.optimize() + if far_file: + generator_main(far_file, {"verbalize": self.fst}) + logger.info(f"VerbalizeFinalFst grammars are saved to {far_file}.") diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..3b4f09154 --- /dev/null +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt @@ -0,0 +1,13 @@ +02.03.2003~deux mars deux mille trois +02/03/2003~deux mars deux mille trois +02-03-2003~deux mars deux mille trois +le 02.03.2003~le deux mars deux mille trois +17.06~dix-sept juin +17 janvier~dix-sept janvier +10 mars 2023~dix mars deux mille vingt-trois +le 10 mars 2023~le dix mars deux mille vingt-trois +les 80s~les eighties +les 17/18 juin~les dix-sept dix-huit juin +les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars +les 17-18-19 juin~les 
dix-sept dix-huit dix-neuf juin +les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq \ No newline at end of file diff --git a/tests/nemo_text_processing/fr/test_date.py b/tests/nemo_text_processing/fr/test_date.py index 614ed0e24..35e3086cd 100644 --- a/tests/nemo_text_processing/fr/test_date.py +++ b/tests/nemo_text_processing/fr/test_date.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class TestDate: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh index 009032118..313c5b06c 100644 --- a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh @@ -1,7 +1,7 @@ #! /bin/sh GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} -PROJECT_DIR=${2:-"/workspace/tests/en"} +PROJECT_DIR=${2:-"/workspace/tests"} runtest () { input=$1 @@ -52,5 +52,10 @@ testTNWord() { runtest $input } +testTNDate() { + input=$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 \ No newline at end of file