How to use the sacremoses.tokenize.MosesTokenizer class in sacremoses

To help you get started, we’ve selected a few sacremoses examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github alvations / sacremoses / sacremoses / cli.py View on Github external
def tokenize_file(
    language,
    processes,
    xml_escape,
    aggressive_dash_splits,
    protected_patterns,
    custom_nb_prefixes,
    encoding,
    quiet
):
    """Tokenize text from stdin with MosesTokenizer and write it to stdout.

    Parameters (wired by the CLI):
        language: language code forwarded to MosesTokenizer as ``lang``.
        processes: not used in the visible portion of this snippet
            (presumably a worker count for parallel tokenization — TODO confirm).
        xml_escape: forwarded to ``moses.tokenize`` as ``escape``.
        aggressive_dash_splits: forwarded to ``moses.tokenize`` unchanged.
        protected_patterns: optional path to a file of patterns, one per
            line; rebound below to the list of stripped pattern strings.
        custom_nb_prefixes: optional path to a custom nonbreaking-prefixes
            file, forwarded to MosesTokenizer.
        encoding: text encoding used for the stdin/stdout streams.
        quiet: not used in the visible portion of this snippet.
    """
    moses = MosesTokenizer(lang=language,
        custom_nonbreaking_prefixes_file=custom_nb_prefixes)

    # If a patterns file was given, replace the path with the list of
    # whitespace-stripped lines read from it (file is read as UTF-8).
    # NOTE(review): blank lines become empty strings here — confirm that
    # is intended by the tokenizer's protected-patterns handling.
    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [pattern.strip() for pattern in fin.readlines()]

    # Pre-bind all tokenizer options so the per-line call takes only text.
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )

    # NOTE(review): the excerpt is truncated here — the bodies of the two
    # stream context managers (the actual read/tokenize/write loop) are
    # missing from this snippet.
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
github alvations / sacremoses / sacremoses / cli.py View on Github external
def tokenize_file(
    language,
    processes,
    xml_escape,
    aggressive_dash_splits,
    protected_patterns,
    custom_nb_prefixes,
    encoding,
    quiet
):
    """Tokenize text from stdin with MosesTokenizer and write it to stdout.

    Parameters (wired by the CLI):
        language: language code forwarded to MosesTokenizer as ``lang``.
        processes: not used in the visible portion of this snippet
            (presumably a worker count for parallel tokenization — TODO confirm).
        xml_escape: forwarded to ``moses.tokenize`` as ``escape``.
        aggressive_dash_splits: forwarded to ``moses.tokenize`` unchanged.
        protected_patterns: optional path to a file of patterns, one per
            line; rebound below to the list of stripped pattern strings.
        custom_nb_prefixes: optional path to a custom nonbreaking-prefixes
            file, forwarded to MosesTokenizer.
        encoding: text encoding used for the stdin/stdout streams.
        quiet: not used in the visible portion of this snippet.
    """
    moses = MosesTokenizer(lang=language,
        custom_nonbreaking_prefixes_file=custom_nb_prefixes)

    # If a patterns file was given, replace the path with the list of
    # whitespace-stripped lines read from it (file is read as UTF-8).
    # NOTE(review): blank lines become empty strings here — confirm that
    # is intended by the tokenizer's protected-patterns handling.
    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [pattern.strip() for pattern in fin.readlines()]

    # Pre-bind all tokenizer options so the per-line call takes only text.
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )

    # NOTE(review): the excerpt is truncated here — the bodies of the two
    # stream context managers (the actual read/tokenize/write loop) are
    # missing from this snippet.
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
github alvations / sacremoses / sacremoses / tokenize.py View on Github external
def __init__(self, lang="en", custom_nonbreaking_prefixes_file=None):
        """Set up the language-specific nonbreaking prefixes for tokenization.

        :param lang: language code used to select the bundled
            nonbreaking-prefix list (default ``"en"``).
        :param custom_nonbreaking_prefixes_file: optional path to a prefix
            file (one prefix per line, ``#`` lines are comments); when given
            it REPLACES the bundled list entirely rather than extending it.
        """
        # Initialize the object.
        super(MosesTokenizer, self).__init__()
        self.lang = lang

        # Initialize the language specific nonbreaking prefixes.
        self.NONBREAKING_PREFIXES = [
            _nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)
        ]

        # Load custom nonbreaking prefixes file: it replaces the bundled
        # list, keeping first occurrence of each prefix in file order.
        if custom_nonbreaking_prefixes_file:
            self.NONBREAKING_PREFIXES = []
            seen = set()  # O(1) dedup instead of O(n) list scan per line
            # Pin the encoding to UTF-8, consistent with how the CLI reads
            # its protected-patterns file; prefix files for most languages
            # contain non-ASCII characters, so relying on the locale's
            # default encoding would be fragile.
            with open(custom_nonbreaking_prefixes_file, encoding="utf8") as fin:
                for line in fin:
                    line = line.strip()
                    # Skip blanks, comments, and duplicates.
                    if line and not line.startswith("#") and line not in seen:
                        seen.add(line)
                        self.NONBREAKING_PREFIXES.append(line)