How to use the sacremoses.corpus.NonbreakingPrefixes class in sacremoses

To help you get started, we’ve selected a few sacremoses examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: alvations/sacremoses on GitHub, file sacremoses/sent_tokenize.py (View on GitHub)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from six import text_type

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk

# Module-level instances of the two sacremoses.corpus helpers imported above,
# created once when this module is imported.
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesSentTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/ems/support/split-sentences.perl
    """
    raise NotImplementedError

    r"""
    # Perl Unicode Properties character sets.
Source: alvations/sacremoses on GitHub, file sacremoses/tokenize.py (View on GitHub)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from six import text_type

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk

# Module-level instances of the two sacremoses.corpus helpers, created once at
# import time. `perluniprops.chars(<name>)` is queried by the MosesTokenizer
# class body below to build its character-set strings.
perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Perl Unicode Properties character sets.
    IsN = text_type("".join(perluniprops.chars("IsN")))
    IsAlnum = text_type("".join(perluniprops.chars("IsAlnum")))  # + u'्'
    IsSc = text_type("".join(perluniprops.chars("IsSc")))
    IsSo = text_type("".join(perluniprops.chars("IsSo")))
    IsAlpha = text_type("".join(perluniprops.chars("IsAlpha")))
    IsLower = text_type("".join(perluniprops.chars("IsLower")))