How to use the sacremoses.corpus.Perluniprops class in sacremoses

To help you get started, we’ve selected a few sacremoses examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github alvations / sacremoses / sacremoses / sent_tokenize.py View on Github external
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from six import text_type

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk

perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesSentTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/ems/support/split-sentences.perl
    """
    raise NotImplementedError

    r"""
    # Perl Unicode Properties character sets.
github alvations / sacremoses / sacremoses / tokenize.py View on Github external
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re

from six import text_type

from sacremoses.corpus import Perluniprops
from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.util import is_cjk

perluniprops = Perluniprops()
nonbreaking_prefixes = NonbreakingPrefixes()


class MosesTokenizer(object):
    """
    This is a Python port of the Moses Tokenizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
    """

    # Perl Unicode Properties character sets.
    IsN = text_type("".join(perluniprops.chars("IsN")))
    IsAlnum = text_type("".join(perluniprops.chars("IsAlnum")))  # + u'्'
    IsSc = text_type("".join(perluniprops.chars("IsSc")))
    IsSo = text_type("".join(perluniprops.chars("IsSo")))
    IsAlpha = text_type("".join(perluniprops.chars("IsAlpha")))
    IsLower = text_type("".join(perluniprops.chars("IsLower")))
github alvations / sacremoses / sacremoses / truecase.py View on Github external
# Python 2.7 compatibility shim: rebind the module-level ``open`` to
# ``io.open`` so that calls in this module can pass an ``encoding`` argument.
import sys

if sys.version_info < (3,):
    import io
    import warnings

    # Shadow the builtin for this module only.
    open = io.open

    # Nudge Python 2 users towards upgrading; the message is wrapped in
    # ``str`` exactly as before so its type stays the native string type.
    _py2_notice = str(
        "You should really be using Python3!!! "
        "Tick tock, tick tock, https://pythonclock.org/"
    )
    warnings.warn(_py2_notice)

perluniprops = Perluniprops()


class MosesTruecaser(object):
    """
    This is a Python port of the Moses Truecaser from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/train-truecaser.perl
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/recaser/truecase.perl
    """

    # Perl Unicode Properties character sets: every character of the named
    # category, joined into one text string.
    Lowercase_Letter = text_type("".join(perluniprops.chars("Lowercase_Letter")))
    Uppercase_Letter = text_type("".join(perluniprops.chars("Uppercase_Letter")))
    # Titlecase letters are a separate category from uppercase letters, so
    # load "Titlecase_Letter" here instead of repeating "Uppercase_Letter".
    Titlecase_Letter = text_type("".join(perluniprops.chars("Titlecase_Letter")))

    def __init__(self, load_from=None, is_asr=None, encoding="utf8"):
        """