Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ pipeline {
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/fr/data/dates/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
20s twenties
30s thirties
40s forties
50s fifties
60s sixties
70s seventies
80s eighties
90s nineties
12 changes: 12 additions & 0 deletions nemo_text_processing/text_normalization/fr/data/dates/months.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1 janvier
2 février
3 mars
4 avril
5 mai
6 juin
7 juillet
8 août
9 septembre
10 octobre
11 novembre
12 décembre
97 changes: 97 additions & 0 deletions nemo_text_processing/text_normalization/fr/taggers/date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
from nemo_text_processing.text_normalization.fr.utils import get_abs_path

# TODO: add articles? 'le...'

# Decade shorthands (e.g. "80s") and month numbers, loaded from the TSV tables under data/dates.
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))

# Exactly two digits; a leading "0" is removed ("07" -> "7"), any other first digit is kept ("17" -> "17").
delete_leading_zero = pynini.union(pynutil.delete("0"), NEMO_DIGIT - "0") + NEMO_DIGIT


class DateFst(GraphFst):
    """
    Finite state transducer for classifying French dates, e.g.:
        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }
        '17 janvier' -> date { day: "dix-sept" month: "janvier" preserve_order: true }
        '17-18 juin' -> date { day: "dix-sept dix-huit" month: "juin" preserve_order: true }
        '80s'        -> date { year: "eighties" preserve_order: true }

    An optional leading determiner ("le " / "les ") is kept inside the day field.

    Args:
        cardinal: cardinal tagger; its ``all_nums_no_tokens`` graph spells out days and years
        deterministic: if True, the first day of the month is only rendered as "premier";
            otherwise the plain cardinal reading is offered as an alternative as well
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        # The tag emitted below (and consumed by the verbalizer) is "date", so the grammar carries that name.
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.all_nums_no_tokens

        # 'le ' -> 'le ', 'les ' -> 'les '
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # '01' -> '1'; a single digit passes through unchanged
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
        valid_day_number = pynini.union(*[str(x) for x in range(1, 32)])

        # '1' -> 'premier'. The cardinal graph also accepts '1', which would leave two
        # equal-weight readings, so in deterministic mode it is restricted to days 2..31.
        premier = pynini.string_map([("1", "premier")])
        if deterministic:
            later_days = pynini.union(*[str(x) for x in range(2, 32)])
            day_number_to_word = premier | (later_days @ cardinal_graph)
        else:
            day_number_to_word = premier | cardinal_graph

        # '@' binds tighter than '+': the determiner is concatenated in front of the composed day graph.
        digit_to_day = self.optional_le + (optional_leading_zero @ valid_day_number @ day_number_to_word)
        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")

        # '03' -> 'mars'
        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
        month_graph = normalize_month_number @ month_numbers
        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")

        # 2025 -> deux mille vingt cinq (two to four digits, no leading zero)
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")

        open_date = pynutil.insert("date { ")
        close_date = pynutil.insert(" preserve_order: true }")

        # Accepts "janvier", "février", etc.
        # pynini.project is the constructive variant: Fst.project would rewrite the shared
        # module-level month table in place and break the digit -> month graph for any
        # DateFst built later in the same process.
        month_names = pynini.project(month_numbers, "output")
        month_name_graph = pynutil.insert("month: \"") + month_names + pynutil.insert("\"")
        optional_spoken_year = pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)

        # Only real date patterns are collected; the empty string is not a date.
        graphs = []

        # Numeric dates: day<sep>month[<sep>year], with the same separator throughout
        for separator in ["/", ".", "-"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            graphs.append(
                open_date
                + self.day_graph
                + separator_to_space
                + self.month_graph
                + pynini.closure(separator_to_space + self.year_graph, 0, 1)
                + close_date
            )

        # Day followed by a written month name: "10 mars 2023"
        graphs.append(
            open_date + self.day_graph + pynini.accep(" ") + month_name_graph + optional_spoken_year + close_date
        )

        # Accepts "70s", "80s", etc
        graphs.append(pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }"))

        # Accepts day ranges: "17-18-19 juin" -> date { day: "dix-sept dix-huit dix-neuf" month: "juin" ... }
        for separator in ["-", "/"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            day_range_graph = (
                pynutil.insert("day: \"")
                + pynini.closure(digit_to_day + separator_to_space, 1)
                + digit_to_day
                + pynutil.insert("\"")
            )
            graphs.append(
                open_date
                + day_range_graph
                + pynini.accep(" ")
                + month_name_graph
                + optional_spoken_year
                + close_date
            )

        self.fst = pynini.union(*graphs).optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst
from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
Expand Down Expand Up @@ -86,8 +87,12 @@ def __init__(
whitelist_graph = self.whitelist.fst
punct_graph = PunctuationFst(deterministic=deterministic).fst

self.date = DateFst(self.cardinal, deterministic=deterministic)
date_graph = self.date.fst

classify = (
pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(date_graph, 1.1)
| pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.09)
| pynutil.add_weight(ordinal_graph, 1.1)
Expand Down
51 changes: 51 additions & 0 deletions nemo_text_processing/text_normalization/fr/verbalizers/date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SPACE,
GraphFst,
delete_preserve_order,
)


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true } -> deux mars deux mille trois

    Accepted field layouts are day + month (+ optional year), month + year, and a lone year field.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def unquote(field: str):
            # Drop `<field>: "` and the closing quote, keeping the non-empty quoted content.
            content = pynini.closure(NEMO_NOT_QUOTE, 1)
            return pynutil.delete(field + ": \"") + content + pynutil.delete("\"")

        day = unquote("day")
        month = unquote("month")
        year = unquote("year")
        # A decade is read from the same "year" field as a regular year.
        decade = unquote("year")

        optional_year = pynini.closure(NEMO_SPACE + year, 0, 1)
        day_month_year = day + NEMO_SPACE + month + optional_year + delete_preserve_order
        month_year = month + NEMO_SPACE + year + delete_preserve_order
        decade_only = decade + delete_preserve_order

        self.graph = pynini.union(day_month_year, month_year, decade_only)

        # Strip the surrounding `date { ... }` wrapper before optimizing.
        self.fst = self.delete_tokens(self.graph).optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
Expand All @@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
fraction_graph = fraction.fst
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
date = DateFst(deterministic=deterministic)
date_graph = date.fst

graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
self.fst = graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
02.03.2003~deux mars deux mille trois
02/03/2003~deux mars deux mille trois
02-03-2003~deux mars deux mille trois
le 02.03.2003~le deux mars deux mille trois
17.06~dix-sept juin
17 janvier~dix-sept janvier
10 mars 2023~dix mars deux mille vingt-trois
le 10 mars 2023~le dix mars deux mille vingt-trois
les 80s~les eighties
les 17/18 juin~les dix-sept dix-huit juin
les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars
les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin
les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq
10 changes: 10 additions & 0 deletions tests/nemo_text_processing/fr/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize import Normalizer

from ..utils import CACHE_DIR, parse_test_case_file

Expand All @@ -29,3 +30,12 @@ class TestDate:
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected

normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
    """Check that the French normalizer turns each written date into its expected spoken form."""
    normalized = self.normalizer.normalize(test_input, verbose=False)
    assert normalized == expected
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ testTNCardinal() {
runtest $input
}

# Run the French text-normalization date cases through the shared `runtest` helper.
testTNDate() {
  # Quoted so a PROJECT_DIR containing spaces or glob characters stays a single argument.
  input="$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt"
  runtest "$input"
}

testTNDecimal() {
input=$PROJECT_DIR/fr/data_text_normalization/test_cases_decimal.txt
runtest $input
Expand Down
Loading