How to use the regex.VERBOSE flag in regex

To help you get started, we’ve selected a few regex examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github facelessuser / backrefs / tests / test_bregex.py View on Github external
def test_infinite_loop_catch(self):
        """Check that patterns expected to trigger the loop guard are rejected.

        Every call to ``bregex.compile_search`` below must raise
        ``_bregex_parse.LoopException``.
        """

        # Positional arguments for each `compile_search` call. The second
        # case intentionally supplies no flags argument at all.
        # NOTE(review): presumably these inline flags contradict each other
        # and would make the parser restart endlessly - confirm in the parser.
        looping_cases = (
            (r'(?-x:(?x))', regex.V0 | regex.VERBOSE),
            (r'(?V1)(?V0)',),
        )

        for call_args in looping_cases:
            with pytest.raises(_bregex_parse.LoopException):
                bregex.compile_search(*call_args)
github tsproisl / SoMaJo / somajo / tokenizer.py View on Github external
# 202B), l-t-r/r-t-l override (202D, 202E), pop directional
        # formatting (202C), zero-width no-break space (FEFF)
        self.other_nasties = re.compile(r"[\u00AD\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]")

        # TAGS, EMAILS, URLs
        self.xml_declaration = re.compile(r"""<\?xml
                                              (?:                #   This group permits zero or more attributes
                                                \s+              #   Whitespace to separate attributes
                                                [_:A-Z][-.:\w]*  #   Attribute name
                                                \s*=\s*          #   Attribute name-value delimiter
                                                (?: "[^"]*"      #   Double-quoted attribute value
                                                  | '[^']*'      #   Single-quoted attribute value
                                                )
                                              )*
                                              \s*                #   Permit trailing whitespace
                                              \?>""", re.VERBOSE | re.IGNORECASE)
        # self.tag = re.compile(r'<(?!-)(?:/[^> ]+|[^>]+/?)(?')
        # taken from Regular Expressions Cookbook
        self.tag = re.compile(r"""
                                  <
                                  (?:                  # Branch for opening tags:
                                    ([_:A-Z][-.:\w]*)  #   Capture the opening tag name to backreference 1
                                    (?:                #   This group permits zero or more attributes
                                      \s+              #   Whitespace to separate attributes
                                      [_:A-Z][-.:\w]*  #   Attribute name
                                      \s*=\s*          #   Attribute name-value delimiter
                                      (?: "[^"]*"      #   Double-quoted attribute value
                                        | '[^']*'      #   Single-quoted attribute value
                                      )
                                    )*
                                    \s*                #   Permit trailing whitespace
                                    /?                 #   Permit self-closed tags
github nameko / nameko / nameko / cli / main.py View on Github external
)
        (?:                 # non capturing optional group for value
            :               # match :
            (               # 2nd capturing group: default value
                (?:         # non capturing group for OR
                    [^{}]   # any non bracket
                |           # OR
                    \{      # literal {
                    (?2)    # recursive 2nd capturing group aka ([^{}]|{(?2)})
                    \}      # literal }
                )*          #
            )
        )?
        \}                  # end of macher }
        """,
        regex.VERBOSE
    )

# Pattern for text that contains a `${...}` placeholder somewhere in it:
# any characters, a literal `${`, any characters, a literal `}`, then any
# characters. Compiled with re.VERBOSE, so the whitespace and `#` comments
# inside the pattern are ignored. DOTALL is not set, so `.` does not match
# a newline.
IMPLICIT_ENV_VAR_MATCHER = re.compile(
    r"""
        .*          # matches any number of any characters
        \$\{.*\}    # matches any number of any characters
                    # between `${` and `}` literally
        .*          # matches any number of any characters
    """, re.VERBOSE
)


def setup_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
github Sefaria / Sefaria-Project / sefaria / library.py View on Github external
node = self.get_title_node(title, lang)
        if bracket:
            #look behind for opening brace ({, and ahead for closing brace })
            re_string = ur"""(?<=							# look behind for opening brace
				[({]										# literal '(', brace,
				[^})]*										# anything but a closing ) or brace
			)
            """ + title + node.delimiter_re + node.regex(lang) + ur"""
            (?=												# look ahead for closing brace
				[^({]*										# match of anything but an opening '(' or brace
				[)}]										# zero-width: literal ')' or brace
			)"""
        else:
            re_string = '^' + title + node.delimiter_re + node.regex(lang)
        return regex.compile(re_string, regex.VERBOSE)  # Uses regex instead of re2 for the more intricate regexes at this stage.
github fnl / segtok / segtok / segmenter.py View on Github external
|   [Gg]eneral
        |   [Mm](?:ag)?is(?:ter|s)
        |   [Pp]rofessor
        |   [Ss]e\u00F1or(?:it)?a?
        ) \s
    # 4.b. if they are most likely part of an author list: (avoiding "...A and B")
    |   (?: (?10%):
# after, though, upon, while, yet
#
# Words hardly used after abbrevs vs. SSs (poor continuations, <2%):
# [after], as, at, but, during, for, in, nor, on, to, [though], [upon],
# whereas, [while], within, [yet]
github nltk / nltk / nltk / tokenize / casual.py View on Github external
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

######################################################################
# This is the core tokenizing regex: every pattern in REGEXPS is joined with
# "|" into one alternation wrapped in a single capturing group, and the
# result is compiled in verbose, case-insensitive, Unicode mode.

WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)

# WORD_RE performs poorly on these patterns: one character that is not an
# ASCII letter or digit, followed by three or more repetitions of that same
# character (i.e. a run of at least four identical characters).
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# These are for regularizing HTML entities to Unicode. Group 1 captures the
# optional "#" / "#x" prefix, group 2 the optional "x" inside that prefix,
# and group 3 the entity body (no "&", ";" or whitespace) before the ";".
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")


######################################################################
# Functions for converting html entities
######################################################################
github InQuest / python-iocextract / iocextract.py View on Github external
# Any number of defang characters.
            (?:
                \x20|
                """ + SEPARATOR_DEFANGS + r"""
            )*

            # Domain/path characters.
            \w
            \S+?

            # CISCO ESA style defangs followed by domain/path characters.
            (?:\x20[\/\.][^\.\/\s]\S*?)*
        )
    """ + END_PUNCTUATION + r"""
        (?=\s|$)
    """, re.IGNORECASE | re.VERBOSE | re.UNICODE)

# Get some obfuscated urls, main anchor is brackets around the period.
BRACKET_URL_RE = re.compile(r"""
        \b
        (
            [\.\:\/\\\w\[\]\(\)-]+
            (?:
                \x20?
                [\(\[]
                \x20?
                \.
                \x20?
                [\]\)]
                \x20?
                \S*?
            )+
github facelessuser / Rummage / rummage / rummage / rumcore / backrefs / bregex.py View on Github external
F = regex.F
    FULLCASE = regex.FULLCASE
    I = regex.I
    IGNORECASE = regex.IGNORECASE
    L = regex.L
    LOCALE = regex.LOCALE
    M = regex.M
    MULTILINE = regex.MULTILINE
    R = regex.R
    REVERSE = regex.REVERSE
    S = regex.S
    DOTALL = regex.DOTALL
    U = regex.U
    UNICODE = regex.UNICODE
    X = regex.X
    VERBOSE = regex.VERBOSE
    V0 = regex.V0
    VERSION0 = regex.VERSION0
    V1 = regex.V1
    VERSION1 = regex.VERSION1
    W = regex.W
    WORD = regex.WORD
    P = regex.P
    POSIX = regex.POSIX
    DEFAULT_VERSION = regex.DEFAULT_VERSION
    REGEX_TYPE = type(regex.compile('', 0))
    escape = regex.escape
    purge = regex.purge

    utokens = {
        "regex_flags": re.compile(
            r'(?s)(\\.)|\(\?((?:[Laberuxp]|V0|V1|-?[imsfw])+)[):]|(.)'
github LuminosoInsight / exquisite-corpus / exquisite_corpus / preprocess.py View on Github external
)
    \)
''', regex.VERBOSE)

# This regex matches Markdown formatting such as _italic_, **bold**, or
# ~strikethrough~, and extracts the text inside it as \2.
MARKDOWN_FORMAT_RES = [
    regex.compile(rf"""
        (?
github fnl / segtok / segtok / tokenizer.py View on Github external
def match_decorator(fn):
        """Expose the compiled pattern's ``split`` and ``match`` on *fn*.

        The pattern comes from ``regex``, a name taken from the enclosing
        scope (not visible here), compiled with ``UNICODE | VERBOSE``.
        The same function object is returned after the two attributes
        have been attached.
        """
        # NOTE(review): `compile` is presumably a regex-library compile
        # imported at module level, not the builtin - confirm in the imports.
        pattern = compile(regex, UNICODE | VERBOSE)
        fn.split, fn.match = pattern.split, pattern.match
        return fn