How to use the regex.Regex function in regex

To help you get started, we’ve selected a few examples of regex.Regex, based on popular ways it is used in public projects.
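Throughout these examples, Regex is regex.Regex, the compiled-pattern constructor exported by the third-party regex package (effectively an alias of regex.compile). A minimal sketch of the basic idiom, compile once and reuse the compiled object:

from regex import Regex  # pip install regex

# Regex(...) returns a compiled pattern object, analogous to
# re.compile() in the standard library.
pattern = Regex(r"\s+")
print(pattern.sub(" ", "too   many\tspaces"))  # 'too many spaces'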


github ufal / mtmonkey / worker / src / util / morphodita.py
def __further_tokenize(self, text):
        # unescape
        text = Regex(r"&pipe;").sub(r"|", text)
        text = Regex(r"<").sub(r"<", text)
        text = Regex(r">").sub(r">", text)
        text = Regex(r"&").sub(r"&", text)

        text = Regex(r"([\p{N}\p{P}\p{S}])([\p{L}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{L}\p{P}\p{S}])([\p{N}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{L}\p{N}\p{S}])([\p{P}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{L}\p{N}\p{P}])([\p{S}])").sub(r"\1 \2", text)

        text = Regex(r"([\p{L}])([\p{N}\p{P}\p{S}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{N}])([\p{L}\p{P}\p{S}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{P}])([\p{L}\p{N}\p{S}])").sub(r"\1 \2", text)
        text = Regex(r"([\p{S}])([\p{L}\p{N}\p{P}])").sub(r"\1 \2", text)

        # re-escape
        text = Regex(r"&").sub(r"&", text)
        text = Regex(r"\|").sub(r"&pipe;", text)
        text = Regex(r"<").sub(r"<", text)
        text = Regex(r">").sub(r">", text)

        return text
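The \p{L}, \p{N}, \p{P} and \p{S} classes above match Unicode letters, numbers, punctuation and symbols respectively, a regex-package feature the standard-library re module lacks; the unescape/re-escape steps bracket the tokenization so the &pipe;, &lt;, &gt; and &amp; placeholders survive intact. A minimal sketch of one of the class-separating substitutions:

from regex import Regex

# Insert a space between a number and the letter that follows it,
# mirroring the \p{N}/\p{L} substitution above.
num_then_letter = Regex(r"([\p{N}])([\p{L}])")
print(num_then_letter.sub(r"\1 \2", "v4x"))  # 'v4 x'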
github ufal / mtmonkey / worker / src / util / split_sentences.py
def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # load no-break prefixes for the given language
        self.__load_nobreaks(options.get('language'),
                             options.get('nobreak_file'))
        # compile regexes
        self.__spaces = Regex(r'\s+')
        self.__space_at_end = Regex(r'(^|\n) ')
        self.__space_at_begin = Regex(r' ($|\n)')
        self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER)
        self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' +
                                self.SENT_STARTER)
        self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
        self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT +
                              r')? *$')
        self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
        self.__numbers = Regex(r'^\p{N}')
        self.__sent_starter = Regex(self.SENT_STARTER)
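The constructor concatenates class-level constants (SENT_STARTER, FINAL_PUNCT, SENT_STARTER_PUNCT) into the pattern strings before compiling, so every regex is built exactly once and reused across calls. A minimal sketch of the same idiom, using a hypothetical stand-in value for SENT_STARTER (its real definition lives elsewhere in split_sentences.py):

from regex import Regex

# Hypothetical stand-in; the real SENT_STARTER is defined
# elsewhere in split_sentences.py.
SENT_STARTER = r'(\p{Upper})'

# Concatenate the constant into the pattern, then compile once.
non_period = Regex(r'([?!]|\.{2,}) +' + SENT_STARTER)
print(non_period.sub(r'\1\n\2', 'Really?! Yes.'))  # newline before 'Yes.'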
github ufal / mtmonkey / worker / src / util / tokenize.py
def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.lowercase = True if options.get('lowercase') else False
        self.moses_escape = True if options.get('moses_escape') else False
        # compile regexes
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.__ascii_junk = Regex(r'[\000-\037]')
        self.__special_chars = \
                Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
        # single quotes: all unicode quotes + prime
        self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
        # double quotes: all unicode chars incl. Chinese + double prime + ditto
        self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
        self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
        self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
        self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
        # hyphen: separate every time but for unary minus
        self.__minus = Regex(r'([-−])')
        self.__pre_notnum = Regex(r'(-)([^\p{N}])')
        self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
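Flags are passed at compile time here (UNICODE is regex.UNICODE, assuming both names are imported from the regex package), and every normalization step is a pre-compiled pattern. A minimal sketch of the quote-normalization idiom:

from regex import Regex, UNICODE

to_single_quotes = Regex(r'[`‛‚‘’‹›′]')  # quote variants + prime
spaces = Regex(r'\s+', flags=UNICODE)    # whitespace runs

text = "it’s  ‘quoted’"
text = to_single_quotes.sub("'", text)  # normalize to ASCII apostrophe
text = spaces.sub(' ', text)            # collapse whitespace
print(text)  # "it's 'quoted'"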