How to use the sacremoses.normalize.MosesPunctNormalizer function in sacremoses

To help you get started, we’ve selected a few sacremoses examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dmlc / gluon-nlp / src / gluonnlp / data / filtering.py View on Github external
def __init__(self, lang: str, remove_non_printable_char: bool = True,
                 unicode_norm_form: Optional[str] = None):
        """Set up the normalizer state.

        Parameters
        ----------
        lang
            Language identifier handed to ``MosesPunctNormalizer``.
        remove_non_printable_char
            Flag stored on the instance as ``_remove_non_printable_char``.
        unicode_norm_form
            Either ``None`` or one of 'NFC', 'NFKC', 'NFD', 'NFKD'; stored on
            the instance as ``_unicode_norm_form``.
        """
        # Forms accepted for `unicode_norm_form` when it is not None.
        supported_forms = ('NFC', 'NFKC', 'NFD', 'NFKD')

        self._remove_non_printable_char = remove_non_printable_char
        self._moses_normalizer = MosesPunctNormalizer(lang)
        self._unicode_norm_form = unicode_norm_form

        # None skips the check entirely; anything else must be a known form.
        assert unicode_norm_form is None or unicode_norm_form in supported_forms, (
            'Unsupported unicode normalization format, you may refer to '
            'https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize for '
            'more details.'
        )

        # NOTE(review): __warmup is defined elsewhere in the class; presumably
        # it primes the normalizer -- confirm against its definition.
        self.__warmup()
github alvations / sacremoses / sacremoses / cli.py View on Github external
def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers,
    replace_unicode_puncts, remove_control_chars, encoding, quiet
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
        pre_replace_unicode_punct=replace_unicode_puncts,
        post_remove_control_chars=remove_control_chars,
    )
    moses_normalize = partial(moses.normalize)

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                #       and it's a lot better but it's inconsistent with the other
                #       preprocessing interfaces, so we're doing it line by line here.
github alvations / sacremoses / sacremoses / cli.py View on Github external
def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers,
    replace_unicode_puncts, remove_control_chars, encoding, quiet
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
        pre_replace_unicode_punct=replace_unicode_punct,
        post_remove_control_chars=remove_control_chars,
    )
    moses_normalize = partial(moses.normalize)

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                #       and it's a lot better but it's inconsistent with the other
                #       preprocessing interfaces, so we're doing it line by line here.