How to use the regex.VERSION1 flag in regex

To help you get started, we’ve selected a few regex examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github facelessuser / BracketHighlighter / backrefs / bregex.py View on Github external
# Re-export the regex module's flag constants and helper functions under
# local names (short and long spelling of each flag).
# NOTE(review): every line after the first is indented, so this run presumably
# sits inside an enclosing block whose header is outside this excerpt - confirm.
L = regex.L
    LOCALE = regex.LOCALE
    M = regex.M
    MULTILINE = regex.MULTILINE
    R = regex.R
    REVERSE = regex.REVERSE
    S = regex.S
    DOTALL = regex.DOTALL
    U = regex.U
    UNICODE = regex.UNICODE
    X = regex.X
    VERBOSE = regex.VERBOSE
    V0 = regex.V0
    VERSION0 = regex.VERSION0
    V1 = regex.V1
    VERSION1 = regex.VERSION1
    W = regex.W
    WORD = regex.WORD
    DEFAULT_VERSION = regex.DEFAULT_VERSION
    # Type of a compiled pattern object, obtained by compiling an empty pattern.
    REGEX_TYPE = type(regex.compile('', 0))
    escape = regex.escape
    purge = regex.purge

    utokens = {
        "regex_flags": re.compile(
            r'(?s)(\\.)|\(\?((?:[Laberux]|V0|V1|-?[imsfw])+)[):]|(.)'
        ),
        "regex_search_ref": re.compile(
            r'''(?x)
            (\\)+
            (
                [(EQ]
github ELS-RD / anonymisation / match_text / match_company_names.py View on Github external
from typing import List

import regex

from modify_text.modify_strings import org_types
from xml_extractions.extract_node_values import Offset

# Company-name pattern, in order:
#   1. an organisation type taken from org_types (case-insensitive), then a space;
#   2. any number of connector words ("de ", "le ", "d'", ...), case-insensitive;
#   3. a first name token starting with an upper-case letter, "&" or "'",
#      optionally preceded by "(";
#   4. any number of further tokens, each introduced by a space and optional
#      connectors / "(" / "& " / "/" / "-".
# VERSION1 is requested because the pattern relies on inline (?i) flags that
# must stay scoped to the group they appear in rather than the whole pattern.
find_corp = regex.compile("(((?i)" + org_types + r") "
                                                 r"((?i)"
                                                 r"(de |le |la |les |pour |l'|et |en |des |d'|au |du )"
                                                 r")*"
                                                 r"((\()?[A-ZÉÈ&']+[\w\-'\.\)]*)"
                                                 r"( (de |le |la |les |pour |l'|et |en |des |d'|au |du |\(|& |/ ?|\- ?)*"
                                                 r"[A-ZÉÈ\-&']+[\w\-'\.\)]*"
                                                 r")*"
                                                 r")", flags=regex.VERSION1)


def get_company_names(text: str) -> List[Offset]:
    """
    Locate every company name mentioned in a text.

    :param text: text to scan
    :return: one ORGANIZATION_1 offset per match of find_corp, in match order
    """
    offsets = []
    for match in find_corp.finditer(text):
        offsets.append(Offset(start=match.start(),
                              end=match.end(),
                              type="ORGANIZATION_1"))
    return offsets
github norbusan / calibre-debian / src / calibre / ebooks / oeb / polish / stats.py View on Github external
def __init__(self, container, do_embed=False):
        # Compile the two patterns only while first_letter_pat is still None.
        # They are assigned on StatsCollector as well as on this instance.
        # NOTE(review): presumably first_letter_pat defaults to None at class
        # level so that later instances skip this branch - confirm.
        if self.first_letter_pat is None:
            # Leading run of punctuation (open/close/initial-quote/final-quote/other).
            # NOTE(review): \p{Ps} is listed twice in the set; the repeat is redundant.
            StatsCollector.first_letter_pat = self.first_letter_pat = regex.compile(
                r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
            # A single letter (\p{L}) or number (\p{N}) character.
            StatsCollector.capitalize_pat = self.capitalize_pat = regex.compile(
                r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)

        self.collect_font_stats(container, do_embed)
github comtravo / ctparse / ctparse / ctparse.py View on Github external
stack.extend(new_stack_elements)
                stack.sort()
                stack = stack[-max_stack_depth:]
                logger.debug(
                    "added {} new stack elements, depth after trunc: {}".format(
                        len(new_stack_elements), len(stack)
                    )
                )
    except CTParseTimeoutError:
        logger.debug('Timeout on "{}"'.format(txt))
        return


# _repl1 matches runs of commas, semicolons, separator characters (\pZ),
# control/"other" characters (\pC) and opening/closing brackets (\p{Ps}, \p{Pe}).
# _repl2 matches runs of dash punctuation (\p{Pd}), U+2010..U+2015 and U+2043.
_repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1)
_repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1)


def _preprocess_string(txt: str) -> str:
    """Normalise separators in *txt*.

    Every run matched by ``_repl1`` becomes a single space and every run
    matched by ``_repl2`` becomes a single hyphen; surrounding whitespace is
    stripped after each substitution.
    """
    spaced = _repl1.sub(" ", txt, concurrent=True).strip()
    dashed = _repl2.sub("-", spaced)
    return cast(str, dashed.strip())


def _match_rule(
    seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]]
) -> Iterator[Tuple[int, int]]:
    if not seq:
        return
    if not rule:
        return
github comtravo / ctparse / ctparse / ctparse.py View on Github external
stack.sort()
                stack = stack[-max_stack_depth:]
                logger.debug(
                    "added {} new stack elements, depth after trunc: {}".format(
                        len(new_stack_elements), len(stack)
                    )
                )
    except CTParseTimeoutError:
        logger.debug('Timeout on "{}"'.format(txt))
        return


# _repl1 matches runs of commas, semicolons, separator characters (\pZ),
# control/"other" characters (\pC) and opening/closing brackets (\p{Ps}, \p{Pe}).
# _repl2 matches runs of dash punctuation (\p{Pd}), U+2010..U+2015 and U+2043.
_repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1)
_repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1)


def _preprocess_string(txt: str) -> str:
    """Normalise separators in *txt*.

    Every run matched by ``_repl1`` becomes a single space and every run
    matched by ``_repl2`` becomes a single hyphen; surrounding whitespace is
    stripped after each substitution.
    """
    spaced = _repl1.sub(" ", txt, concurrent=True).strip()
    dashed = _repl2.sub("-", spaced)
    return cast(str, dashed.strip())


def _match_rule(
    seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]]
) -> Iterator[Tuple[int, int]]:
    if not seq:
        return
    if not rule:
        return
    i_r = 0
github toukoaozaki / vpnoverdns-reassembler / vodreassembler / protocol.py View on Github external
def __init__(self, fqdn_suffix=None):
    """Store the normalised FQDN suffix and compile the name matcher.

    fqdn_suffix: suffix the matched names must end with; a falsy value
        selects DEFAULT_FQDN_SUFFIX.

    The compiled pattern accepts, in order: zero or more "<flag>." labels,
    one or more "<var>-<value>." labels, a "v<version>." label and the
    escaped suffix, with optional surrounding whitespace.
    """
    self._suffix = normalize_fqdn_suffix(fqdn_suffix or DEFAULT_FQDN_SUFFIX)
    # NOTE(review): the group names "flag", "value" and "version" were missing
    # (the groups read "(?P\w+)", which is not valid syntax) and have been
    # reconstructed from the trailing comments - confirm against the code that
    # reads these groups from the match object.
    self._re = regex.compile(
        r'''^\s*
              ((?P<flag>\w+)\.)*                # flags
              ((?P<var>\w+)-(?P<value>\w+)\.)+  # variables
              v(?P<version>\w+)\.               # version
              {!s}                              # suffix
            \s*$'''.format(regex.escape(self._suffix)),
        regex.VERSION1 | regex.VERBOSE)
github ELS-RD / anonymisation / match_text / match_date.py View on Github external
"""
    return '|'.join(original_list)


# Date spelled out in words: three space-separated groups built from the word
# lists un_trent_et_un, months and years (presumably day-number words, month
# names and year words - confirm against their definitions).
# NOTE(review): if get_or_regex returns a bare "a|b|c" alternation, the "."
# and the optional trailing group bind only to the LAST alternative of years,
# and the unescaped "." matches any character - confirm this is intended.
date_pattern_in_letters = "(" + get_or_regex(un_trent_et_un) + ") (" + get_or_regex(months) + ") (" + \
                          get_or_regex(years) + "." + "(" + get_or_regex(un_trent_et_un) + ")?)"

date_pattern_in_letters_regex = regex.compile(date_pattern_in_letters,
                                              flags=regex.VERSION1 | regex.IGNORECASE)

# Numeric day (optionally followed by "er"), a month name, then a two-digit
# year with an optional "19"/"20" century prefix.
# NOTE(review): "20" appears twice in the century alternation; the repeat is redundant.
date_pattern_in_numbers_1 = r"[0-3]?\d( ?er)? (" + get_or_regex(months) + r") (19|20|20)?\d{2}"
date_pattern_in_numbers_regex_1 = regex.compile(date_pattern_in_numbers_1,
                                                flags=regex.VERSION1 | regex.IGNORECASE)

# Fully numeric date: 1-2 digits, separator, 1-2 digits, separator, 2-4 digits.
# Each separator is "/" or "-" and may have one arbitrary character on either side.
date_pattern_in_numbers_regex_2 = regex.compile(r'(\d{1,2}.?(/|\-).?\d{1,2}.?(/|\-).?\d{2,4})',
                                                flags=regex.VERSION1 | regex.IGNORECASE)


def get_date(text: str) -> List[Offset]:
    """
    Find every date mention in a text.

    The three date patterns are applied one after the other (dates in words,
    then numeric day + month name, then fully numeric dates); their matches
    are concatenated in that order.

    :param text: text to scan
    :return: list of DATE_1 offsets
    """
    patterns = (date_pattern_in_letters_regex,
                date_pattern_in_numbers_regex_1,
                date_pattern_in_numbers_regex_2)
    offsets = []
    for pattern in patterns:
        offsets.extend(Offset(match.start(), match.end(), "DATE_1")
                       for match in pattern.finditer(text))
    return offsets
github ELS-RD / anonymisation / generate_trainset / match_patterns.py View on Github external
def get_clerk_name(text: str) -> list:
    """
    Extract clerk names from text.

    Both clerk patterns are applied; matches of extract_clerk_pattern_1 come
    first, followed by matches of extract_clerk_pattern_2.

    :param text: original paragraph text
    :return: list of (start, end, "GREFFIER") tuples
    """
    result1 = [(t.start(), t.end(), "GREFFIER") for t in extract_clerk_pattern_1.finditer(text)]
    result2 = [(t.start(), t.end(), "GREFFIER") for t in extract_clerk_pattern_2.finditer(text)]
    return result1 + result2


# Lawyer name: one or more capitalised tokens that directly follow a title
# ("Me", "Me.", "Maitre"/"Maître" in either case of the initial, or "M°") and
# a space. The title sits in a lookbehind, so it is not part of the match.
# Raw strings keep the regex escapes (\w, \., \-) out of Python's own
# string-escape processing; the resulting pattern text is unchanged.
extract_lawyer = regex.compile(r"(?<=(Me|Me\.|(M|m)a(i|î)tre|M°) )"
                               r"[A-ZÉÈ]+[\w-']*"
                               r"( [A-ZÉÈ\-]+[\w-']*)*",
                               flags=regex.VERSION1)


def get_lawyer_name(text: str) -> list:
    """
    Extract lawyer names from text.

    :param text: original paragraph text
    :return: list of (start, end, "AVOCAT") tuples, one per match of extract_lawyer
    """
    return [(t.start(), t.end(), "AVOCAT") for t in extract_lawyer.finditer(text)]


places_pattern = ("rue|chemin|boulevard|bd\.?|bld|av(\.|e)?|avenue|allée|quai|"
                  "(?
github kovidgoyal / calibre / src / calibre / ebooks / metadata / author_mapper.py View on Github external
def compile_pat(pat):
    """Compile *pat* with the ``regex`` module.

    The pattern is compiled with the VERSION1, WORD, FULLCASE, IGNORECASE and
    UNICODE flags combined. ``regex`` is imported on each call rather than at
    module import time.
    """
    import regex
    return regex.compile(
        pat,
        flags=regex.VERSION1 | regex.WORD | regex.FULLCASE | regex.IGNORECASE | regex.UNICODE,
    )