Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Re-export the `regex` module's flag constants under both their short and
# long names so this module can be used as a drop-in replacement for the
# `regex`/`re` flag namespace.
L = regex.L
LOCALE = regex.LOCALE
M = regex.M
MULTILINE = regex.MULTILINE
R = regex.R
REVERSE = regex.REVERSE
S = regex.S
DOTALL = regex.DOTALL
U = regex.U
UNICODE = regex.UNICODE
X = regex.X
VERBOSE = regex.VERBOSE
V0 = regex.V0
VERSION0 = regex.VERSION0
V1 = regex.V1
VERSION1 = regex.VERSION1
W = regex.W
WORD = regex.WORD
DEFAULT_VERSION = regex.DEFAULT_VERSION
# Concrete type of a compiled pattern; the regex module does not export it
# directly, so it is recovered from a trivial compile.
REGEX_TYPE = type(regex.compile('', 0))
# Convenience re-exports of the module-level helper functions.
escape = regex.escape
purge = regex.purge
utokens = {
"regex_flags": re.compile(
r'(?s)(\\.)|\(\?((?:[Laberux]|V0|V1|-?[imsfw])+)[):]|(.)'
),
"regex_search_ref": re.compile(
r'''(?x)
(\\)+
(
[(EQ]
from typing import List
import regex
from modify_text.modify_strings import org_types
from xml_extractions.extract_node_values import Offset
# Company-name matcher: an organisation-type keyword (taken from org_types),
# optionally followed by French linking words ("de", "le", "la", ...), then
# one or more capitalised tokens.  The case-insensitive parts are scoped with
# inline (?i) groups; the rest of the pattern is case-sensitive so that the
# name itself must start with an uppercase letter.
# NOTE(review): org_types is defined in modify_text.modify_strings and is
# assumed to be a plain alternation string — confirm it contains no
# unescaped metacharacters.
find_corp = regex.compile("(((?i)" + org_types + r") "
                          r"((?i)"
                          r"(de |le |la |les |pour |l'|et |en |des |d'|au |du )"
                          r")*"
                          r"((\()?[A-ZÉÈ&']+[\w\-'\.\)]*)"
                          r"( (de |le |la |les |pour |l'|et |en |des |d'|au |du |\(|& |/ ?|\- ?)*"
                          r"[A-ZÉÈ\-&']+[\w\-'\.\)]*"
                          r")*"
                          r")", flags=regex.VERSION1)
def get_company_names(text: str) -> List[Offset]:
    """Extract company names from ``text``.

    :param text: original text to scan
    :return: list of :class:`Offset` objects tagged ``"ORGANIZATION_1"``
    """
    offsets = []
    for match in find_corp.finditer(text):
        offsets.append(Offset(start=match.start(),
                              end=match.end(),
                              type="ORGANIZATION_1"))
    return offsets
def __init__(self, container, do_embed=False):
    """Collect font statistics for *container*.

    :param container: document container whose fonts are analysed — passed
        straight through to ``collect_font_stats``
    :param do_embed: forwarded flag; semantics live in ``collect_font_stats``
    """
    # Lazily compile the shared patterns once and cache them on the CLASS
    # (StatsCollector) so that subsequent instances reuse the compiled objects.
    if self.first_letter_pat is None:
        # Leading punctuation at the start of a string: open/close brackets,
        # initial/final quotes, other punctuation.
        # NOTE(review): \p{Ps} appears twice in this class — likely one of
        # them was meant to be a different category; confirm upstream.
        StatsCollector.first_letter_pat = self.first_letter_pat = regex.compile(
            r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
        # First letter-or-digit character (any Unicode script).
        StatsCollector.capitalize_pat = self.capitalize_pat = regex.compile(
            r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)
    self.collect_font_stats(container, do_embed)
stack.extend(new_stack_elements)
stack.sort()
stack = stack[-max_stack_depth:]
logger.debug(
"added {}Â new stack elements, depth after trunc: {}".format(
len(new_stack_elements), len(stack)
)
)
except CTParseTimeoutError:
logger.debug('Timeout on "{}"'.format(txt))
return
# Separator squasher: runs of commas, semicolons, Unicode separators (\pZ),
# invisible control characters (\pC) and opening/closing brackets.
_repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1)
# Dash squasher: any dash punctuation (\p{Pd}), the hyphen variants
# U+2010..U+2015, or the hyphen bullet U+2043 — collapsed to a single "-".
_repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1)
def _preprocess_string(txt: str) -> str:
    """Normalise *txt*: squash separator runs to single spaces and dash runs
    to a single "-", trimming surrounding whitespace."""
    spaced = _repl1.sub(" ", txt, concurrent=True).strip()
    dashed = _repl2.sub("-", spaced).strip()
    return cast(str, dashed)
def _match_rule(
seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]]
) -> Iterator[Tuple[int, int]]:
if not seq:
return
if not rule:
return
stack.sort()
stack = stack[-max_stack_depth:]
logger.debug(
"added {}Â new stack elements, depth after trunc: {}".format(
len(new_stack_elements), len(stack)
)
)
except CTParseTimeoutError:
logger.debug('Timeout on "{}"'.format(txt))
return
# Separator squasher: runs of commas, semicolons, Unicode separators (\pZ),
# invisible control characters (\pC) and opening/closing brackets.
_repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1)
# Dash squasher: any dash punctuation (\p{Pd}), the hyphen variants
# U+2010..U+2015, or the hyphen bullet U+2043 — collapsed to a single "-".
_repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1)
def _preprocess_string(txt: str) -> str:
    """Return *txt* with separator runs replaced by spaces and dash runs by
    a single "-", stripped of leading/trailing whitespace."""
    without_separators = _repl1.sub(" ", txt, concurrent=True).strip()
    return cast(str, _repl2.sub("-", without_separators).strip())
def _match_rule(
seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]]
) -> Iterator[Tuple[int, int]]:
if not seq:
return
if not rule:
return
i_r = 0
def __init__(self, fqdn_suffix=None):
    """Build the FQDN matcher for the given suffix.

    :param fqdn_suffix: suffix to anchor the pattern on; falls back to
        DEFAULT_FQDN_SUFFIX when None/empty.
    """
    self._suffix = normalize_fqdn_suffix(fqdn_suffix or DEFAULT_FQDN_SUFFIX)
    # FIX: the named groups were written as "(?P\w+)" — missing the
    # "<name>" part, which makes regex.compile raise a syntax error.
    # Names reconstructed from the inline comments; confirm against the
    # code that consumes the match's groupdict.
    self._re = regex.compile(
        r'''^\s*
        ((?P<flag>\w+)\.)*                 # flags
        ((?P<var>\w+)-(?P<value>\w+)\.)+   # variables
        v(?P<version>\w+)\.                # version
        {!s}                               # suffix
        \s*$'''.format(regex.escape(self._suffix)),
        regex.VERSION1 | regex.VERBOSE)
</var>
c = lambda x:regex.compile(x, flags=regex.VERSION1)
"""
return '|'.join(original_list)
# Dates spelled out in letters: "<day> <month> <year>[<day>]".
# NOTE(review): the bare "." between the year and the optional trailing day
# matches ANY single character — presumably a literal space or dot was
# intended; confirm before tightening.
date_pattern_in_letters = "(" + get_or_regex(un_trent_et_un) + ") (" + get_or_regex(months) + ") (" + \
    get_or_regex(years) + "." + "(" + get_or_regex(un_trent_et_un) + ")?)"
date_pattern_in_letters_regex = regex.compile(date_pattern_in_letters,
                                              flags=regex.VERSION1 | regex.IGNORECASE)
# Numeric day ("1", "1er", ...) + month in letters + 2- or 4-digit year.
# FIX: "(19|20|20)" contained a duplicated "20" alternative; "(19|20)" is
# equivalent and intentional.
date_pattern_in_numbers_1 = r"[0-3]?\d( ?er)? (" + get_or_regex(months) + r") (19|20)?\d{2}"
date_pattern_in_numbers_regex_1 = regex.compile(date_pattern_in_numbers_1,
                                                flags=regex.VERSION1 | regex.IGNORECASE)
# Fully numeric dates such as "12/03/1998" or "12-03-98"; the ".?" on each
# side tolerates one stray character around the "/" or "-" separator.
date_pattern_in_numbers_regex_2 = regex.compile(r'(\d{1,2}.?(/|\-).?\d{1,2}.?(/|\-).?\d{2,4})',
                                                flags=regex.VERSION1 | regex.IGNORECASE)
def get_date(text: str) -> List[Offset]:
    """Parse *text* and return the offsets of every date mention.

    :param text: original text
    :return: offsets as a list, tagged "DATE_1"
    """
    results = []
    # Order matters for callers that rely on the original concatenation
    # order: letters pattern first, then the two numeric patterns.
    for pattern in (date_pattern_in_letters_regex,
                    date_pattern_in_numbers_regex_1,
                    date_pattern_in_numbers_regex_2):
        results.extend(Offset(m.start(), m.end(), "DATE_1")
                       for m in pattern.finditer(text))
    return results
def get_clerk_name(text: str) -> list:
    """Extract clerk ("greffier") names from *text*.

    :param text: original paragraph text
    :return: list of (start, end, "GREFFIER") tuples
    """
    # Pattern-1 matches first, then pattern-2 — same ordering as the
    # original result1 + result2 concatenation.
    return [(m.start(), m.end(), "GREFFIER")
            for pattern in (extract_clerk_pattern_1, extract_clerk_pattern_2)
            for m in pattern.finditer(text)]
# Lawyer name: a title ("Me", "Me.", "Maitre"/"Maître", "M°") must appear
# immediately before the capitalised name (variable-length lookbehind —
# supported by the regex module, not by stdlib re).
# FIX: pattern strings made raw so "\w", "\." and "\-" are genuine regex
# escapes instead of invalid Python string escapes (SyntaxWarning on 3.12+);
# the resulting pattern bytes are identical.
# NOTE(review): "[\w-']" relies on the "-" after \w being treated literally
# inside the class — confirm, or reorder to "[\w'-]" for clarity.
extract_lawyer = regex.compile(r"(?<=(Me|Me\.|(M|m)a(i|î)tre|M°) )"
                               r"[A-ZÉÈ]+[\w-']*"
                               r"( [A-ZÉÈ\-]+[\w-']*)*",
                               flags=regex.VERSION1)
def get_lawyer_name(text: str) -> list:
    """Extract lawyer names from *text*.

    :param text: original paragraph text
    :return: list of (start, end, "AVOCAT") tuples
    """
    found = []
    for match in extract_lawyer.finditer(text):
        found.append((match.start(), match.end(), "AVOCAT"))
    return found
places_pattern = ("rue|chemin|boulevard|bd\.?|bld|av(\.|e)?|avenue|allée|quai|"
"(?
def compile_pat(pat):
    """Compile *pat* with the project's default matching behaviour:
    VERSION1 semantics, word-boundary handling, full case-folding,
    case-insensitive, Unicode."""
    import regex
    default_flags = (regex.VERSION1 | regex.WORD | regex.FULLCASE
                     | regex.IGNORECASE | regex.UNICODE)
    return regex.compile(pat, flags=default_flags)