close
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pipeline {
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-30-24-0'
IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
50 changes: 33 additions & 17 deletions nemo_text_processing/text_normalization/en/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

Expand Down Expand Up @@ -79,20 +79,36 @@
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
)

# Common string literals shared across grammars; extend this list as needed.

# Token field names, combined with `colon`, a space and `double_quotes`
# to form the `<field>: "<value>"` prefix of a tagged token.
username_string = "username"
domain_string = "domain"
protocol_string = "protocol"

# Punctuation and separators.
double_quotes = '"'
period = "."
at = "@"
colon = ":"
slash = "/"
double_slash = "//"
triple_slash = "///"

# URL scheme and host prefixes.
file = "file"
http = "http"
https = "https"
www = "www"

suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

graph_plural = plurals._priority_union(
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
).optimize()

SINGULAR_TO_PLURAL = graph_plural
Expand All @@ -107,8 +123,8 @@


def capitalized_input_graph(
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
) -> "pynini.FstLike":
"""
Allow graph input to be capitalized (e.g. for ITN)

Expand All @@ -129,7 +145,7 @@ def capitalized_input_graph(
return graph


def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

Expand All @@ -141,7 +157,7 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
logger.info(f'Created {file_name}')
logger.info(f"Created {file_name}")


def get_plurals(fst):
Expand All @@ -168,7 +184,7 @@ def get_singulars(fst):
return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> 'pynini.FstLike':
def convert_space(fst) -> "pynini.FstLike":
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
Expand All @@ -191,7 +207,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
written_capitalized = written[0].upper() + written[1:]
additional_labels.extend(
[
[written_capitalized, spoken.capitalize()], # first letter capitalized
[written_capitalized, spoken.capitalize(),], # first letter capitalized
[
written_capitalized,
spoken.upper().replace(" AND ", " and "),
Expand All @@ -205,7 +221,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
logger.debug(f"This is weight {weight}")
if len(weight) == 0:
additional_labels.extend(
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
)
else:
additional_labels.extend(
Expand Down Expand Up @@ -237,7 +253,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
self._fst = None
self.deterministic = deterministic

self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
if self.far_exist():
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

Expand All @@ -248,14 +264,14 @@ def far_exist(self) -> bool:
return self.far_path.exists()

@property
def fst(self) -> 'pynini.FstLike':
def fst(self) -> "pynini.FstLike":
return self._fst

@fst.setter
def fst(self, fst):
self._fst = fst

def add_tokens(self, fst) -> 'pynini.FstLike':
def add_tokens(self, fst) -> "pynini.FstLike":
"""
Wraps class name around to given fst

Expand All @@ -267,7 +283,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike':
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

def delete_tokens(self, fst) -> 'pynini.FstLike':
def delete_tokens(self, fst) -> "pynini.FstLike":
"""
Deletes class name wrap around output of given fst

Expand All @@ -286,4 +302,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
85 changes: 55 additions & 30 deletions nemo_text_processing/text_normalization/it/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,26 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
http,
https,
period,
protocol_string,
username_string,
www,
)
from nemo_text_processing.text_normalization.it.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
# common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]


class ElectronicFst(GraphFst):
Expand All @@ -37,44 +52,54 @@ def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)

# e-mail
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
)
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
symbols_no_period = pynini.difference(symbols, dot)
accepted_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1)
all_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols), 1)

domain_graph = accepted_characters + dot + accepted_characters
# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert("domain: \"")
+ domain_graph
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters + pynini.closure(domain, 1))
+ pynutil.insert(double_quotes)
)

domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
# email
username = (
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ all_characters
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
email = username + domain_graph

graph = (username + domain_graph) | domain_common_graph
# social media tags
tag = (
pynutil.delete(at)
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert(double_quotes)
)

protocol_start = pynini.accep("https://") | pynini.accep("http://")
# url
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
protocol_end = (
pynini.accep("www.") if deterministic else pynini.accep("www.") | pynini.cross("www.", "vu vu vu.")
pynini.accep(www + period)
if deterministic
else pynini.accep(www + period) | pynini.cross((www + period), "vu vu vu.")
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ protocol
+ pynutil.insert(double_quotes)
)
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
os.makedirs(cache_dir, exist_ok=True)
whitelist_file = os.path.basename(whitelist) if whitelist else ""
far_file = os.path.join(
cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far"
cache_dir, f"_{input_case}_it_tn_{deterministic}_deterministic{whitelist_file}.far",
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
Expand All @@ -86,10 +86,10 @@ def __init__(
self.electronic = ElectronicFst(deterministic=deterministic)
electronic_graph = self.electronic.fst

self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
measure_graph = self.measure.fst

self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
money_graph = self.money.fst

self.time = TimeFst(deterministic=deterministic)
Expand All @@ -98,17 +98,17 @@ def __init__(
punct_graph = PunctuationFst(deterministic=deterministic).fst

classify = (
pynutil.add_weight(whitelist_graph, 1)
pynutil.add_weight(whitelist_graph, 0.0)
| pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.09)
| pynutil.add_weight(measure_graph, 1.09)
| pynutil.add_weight(money_graph, 1.09)
| pynutil.add_weight(time_graph, 1.09)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
)

punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
punct = pynini.closure(
pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
| (pynutil.insert(" ") + punct),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
from nemo_text_processing.text_normalization.en.graph_utils import ( # Common string literals; expand as you see fit.
NEMO_NOT_QUOTE,
NEMO_SIGMA,
NEMO_SPACE,
GraphFst,
colon,
delete_preserve_order,
insert_space,
domain_string,
double_quotes,
protocol_string,
username_string,
)
from nemo_text_processing.text_normalization.it.utils import get_abs_path

Expand All @@ -35,7 +40,7 @@
class ElectronicFst(GraphFst):
"""
Finite state transducer for verbalizing electronic
e.g. electronic { username: "abc.def2" domain: "studenti.università.it" } ->
e.g. electronic { username: "abc.def2" domain: "studenti.università.it" } ->
"a b c punto d e f due chiocciola s t u d e n t i punto u n i v e r s i t à punto IT"
Args:
deterministic: if True will provide a single transduction option,
Expand All @@ -48,27 +53,36 @@ def __init__(self, deterministic: bool = True):
graph_digit = digit_no_zero | zero

def add_space_after_char():
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
NEMO_NOT_QUOTE - pynini.accep(" ")
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE) + pynutil.insert(NEMO_SPACE)) + (
NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE)
)

verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)

user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alias username as its own variable

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

user_name = (
pynutil.delete(username_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char()
+ pynutil.delete(double_quotes)
)
user_name @= verbalize_characters

convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | server_common | domain_common
domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
domain = convert_defaults + pynini.closure(pynutil.insert(NEMO_SPACE) + convert_defaults)
domain @= verbalize_characters

domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alias domain as its own variable

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

domain = (
pynutil.delete(domain_string + colon + NEMO_SPACE + double_quotes) + domain + pynutil.delete(double_quotes)
)
protocol = (
pynutil.delete("protocol: \"")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just alias protocol as its own variable

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

pynutil.delete(protocol_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
+ pynutil.delete("\"")
+ pynutil.delete(double_quotes)
)
self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
user_name + pynini.accep(" ") + pynutil.insert("chiocciola ") + domain

self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
user_name + NEMO_SPACE + pynutil.insert("chiocciola ") + domain
| (pynutil.insert("chiocciola ") + user_name)
)

delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
self.fst = delete_tokens.optimize()
Loading