Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __init__(self, lang: str, remove_non_printable_char: bool = True,
             unicode_norm_form: Optional[str] = None):
    """Set up the normalizer state for one language.

    Parameters
    ----------
    lang
        Language code forwarded to ``MosesPunctNormalizer``.
    remove_non_printable_char
        Stored on the instance as ``_remove_non_printable_char``.
    unicode_norm_form
        Optional unicode normalization form. When given it must be one of
        ``'NFC'``, ``'NFKC'``, ``'NFD'`` or ``'NFKD'``; ``None`` skips the
        check entirely.

    Raises
    ------
    AssertionError
        If ``unicode_norm_form`` is not ``None`` and is not a supported form
        (only when assertions are enabled).
    """
    self._remove_non_printable_char = remove_non_printable_char
    self._moses_normalizer = MosesPunctNormalizer(lang)
    self._unicode_norm_form = unicode_norm_form

    # The form is validated after the attributes are stored, so a failed
    # check leaves them set on the partially initialised instance.
    supported_forms = ('NFC', 'NFKC', 'NFD', 'NFKD')
    assert unicode_norm_form is None or unicode_norm_form in supported_forms, (
        'Unsupported unicode normalization format, you may refer to '
        'https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for '
        'more details.'
    )

    # Always run the private warm-up hook last, once all state is in place.
    self.__warmup()
# Build a MosesPunctNormalizer from the given options and open the stdin and
# stdout text streams (via click) with the requested encoding.
#
# Arguments as used in the visible portion:
#   language                -- first positional argument to MosesPunctNormalizer.
#   processes               -- compared against 1 to select the single-process path.
#   normalize_quote_commas  -- passed as ``norm_quote_commas``.
#   normalize_numbers       -- passed as ``norm_numbers``.
#   replace_unicode_puncts  -- passed as ``pre_replace_unicode_punct``.
#   remove_control_chars    -- passed as ``post_remove_control_chars``.
#   encoding                -- encoding used for both text streams.
#   quiet                   -- not referenced in the visible portion.
#
# NOTE(review): this excerpt ends inside the ``processes == 1`` branch; the
# actual read/normalize/write logic and any multi-process path are not shown.
def normalize_file(
language, processes, normalize_quote_commas, normalize_numbers,
replace_unicode_puncts, remove_control_chars, encoding, quiet
):
moses = MosesPunctNormalizer(
language,
norm_quote_commas=normalize_quote_commas,
norm_numbers=normalize_numbers,
pre_replace_unicode_punct=replace_unicode_puncts,
post_remove_control_chars=remove_control_chars,
)
# No arguments are bound here; presumably functools.partial, in which case
# this is just a callable alias for ``moses.normalize`` -- confirm import.
moses_normalize = partial(moses.normalize)
with click.get_text_stream("stdin", encoding=encoding) as fin:
with click.get_text_stream("stdout", encoding=encoding) as fout:
# With a single process, joblib parallelization is slower,
# so just process line by line normally.
if processes == 1:
# TODO: Actually moses_normalize(fin.read()) gives the same output
# and it's a lot better but it's inconsistent with the other
# preprocessing interfaces, so we're doing it line by line here.
def normalize_file(
language, processes, normalize_quote_commas, normalize_numbers,
replace_unicode_puncts, remove_control_chars, encoding, quiet
):
moses = MosesPunctNormalizer(
language,
norm_quote_commas=normalize_quote_commas,
norm_numbers=normalize_numbers,
pre_replace_unicode_punct=replace_unicode_punct,
post_remove_control_chars=remove_control_chars,
)
moses_normalize = partial(moses.normalize)
with click.get_text_stream("stdin", encoding=encoding) as fin:
with click.get_text_stream("stdout", encoding=encoding) as fout:
# If it's single process, joblib parallization is slower,
# so just process line by line normally.
if processes == 1:
# TODO: Actually moses_normalize(fin.read()) gives the same output
# and it's a lot better but it's inconsistent with the other
# preprocessing interfaces, so we're doing it line by line here.