How to use the regex.U flag in regex

To help you get started, we’ve selected a few regex examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scrapinghub / dateparser / dateparser / View on Github external
from __future__ import unicode_literals

import regex as re
from datetime import datetime
from datetime import time
from tzlocal import get_localzone

from dateutil.relativedelta import relativedelta

from dateparser.utils import apply_timezone, localize_timezone, strip_braces
from .parser import time_parser
from .timezone_parser import pop_tz_offset_from_string

# Regex alternation of the time-unit words this module recognises.
_UNITS = r'year|month|week|day|hour|minute|second'

# Captures a run of digits and the unit word that follows it (optional
# whitespace in between). The trailing \b means the unit must end at a word
# boundary, so a plural such as "months" does not match.
PATTERN = re.compile(
    r'(\d+)\s*(%s)\b' % _UNITS,
    flags=re.U | re.S | re.I,
)

class FreshnessDateDataParser(object):
    """ Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """
    def __init__(self): = None

    def _are_all_words_units(self, date_string):
        skip = [_UNITS,

        date_string = re.sub(r'\s+', ' ', date_string.strip())

        words = filter(lambda x: x if x else False, re.split(r'\W', date_string))
        words = filter(lambda x: not re.match(r'%s' % '|'.join(skip), x), words)
github urduhack / urduhack / urduhack / normalization / space / View on Github external
# coding: utf8
"""Space utils"""
import regex as re

from urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_PUNCTUATIONS

# Zero-width positions where an Urdu character directly touches an ASCII
# digit, in either order. Example inputs: 18سالہ  , 20فیصد
SPACE_BEFORE_DIGITS_RE = re.compile(
    "(?<=[%s])(?=[0-9])" % "".join(URDU_ALL_CHARACTERS),
    flags=re.I | re.M | re.U,
)
SPACE_AFTER_DIGITS_RE = re.compile(
    "(?<=[0-9])(?=[%s])" % "".join(URDU_ALL_CHARACTERS),
    flags=re.I | re.M | re.U,
)
# Add a space after Urdu punctuation (e.g. "."), unless it is followed by a digit (e.g. 9.00), another punctuation mark, or a space
        r"(?<=[" + "".join(URDU_PUNCTUATIONS) + "])(?=[^" + "".join(URDU_PUNCTUATIONS) + "0-9 ])",
        flags=re.U | re.M | re.I)

def digits_space(text: str) -> str:
    """
    Add spaces before|after numeric and urdu digits

    Args:
        text (str): text

    Returns:
        str: ``text`` after substituting a single space at every position
        matched by ``SPACE_BEFORE_DIGITS_RE`` and then at every position
        matched by ``SPACE_AFTER_DIGITS_RE``.
    """
    # Both patterns are applied in sequence; the second runs on the output of
    # the first.
    text = SPACE_BEFORE_DIGITS_RE.sub(' ', text)
    text = SPACE_AFTER_DIGITS_RE.sub(' ', text)

    return text
github scrapinghub / dateparser / scripts / View on Github external
    u'\N{RIGHT SINGLE QUOTATION MARK}',     # u'\u2019'
    u'\N{MODIFIER LETTER APOSTROPHE}',      # u'\u02bc'
    u'\N{MODIFIER LETTER TURNED COMMA}',    # u'\u02bb'
    u'\N{ARMENIAN APOSTROPHE}',             # u'\u055a'
    u'\N{LATIN SMALL LETTER SALTILLO}',     # u'\ua78c'
    u'\N{PRIME}',                           # u'\u2032'
    u'\N{REVERSED PRIME}',                  # u'\u2035'
    u'\N{MODIFIER LETTER PRIME}',           # u'\u02b9'
    u'\N{FULLWIDTH APOSTROPHE}',            # u'\uff07'

# Three runs of date-order letters (D, M or Y). Each of the first two runs may
# be followed by U+200F characters and then by any mix of '-', '/', '.',
# space or tab. Every group captures the last letter of its run.
DATE_ORDER_PATTERN = re.compile(
    u'([DMY])+\u200f*[-/. \t]*'
    u'([DMY])+\u200f*[-/. \t]*'
    u'([DMY])+'
)
RELATIVE_PATTERN = re.compile(r'(?
github vertexproject / synapse / synapse / models / View on Github external
import logging
import ipaddress
import email.utils

import regex

import synapse.exc as s_exc
import synapse.common as s_common
import synapse.lib.chop as s_chop
import synapse.lib.types as s_types
import synapse.lib.scrape as s_scrape
import synapse.lib.module as s_module
import synapse.lookup.iana as s_l_iana

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# A whole string made only of word characters, '.', '_' and '-'.
fqdnre = regex.compile(r'^[\w._-]+$', flags=regex.U)
# "[<lowercase hex and colons>]:<digits>", capturing the address and the port.
srv6re = regex.compile(r'^\[([a-f0-9:]+)\]:(\d+)$')

# 33 (mask, size) pairs indexed by n = 0..32: the 32-bit value with the top n
# bits set, and 2 ** (32 - n).
cidrmasks = [
    (0xffffffff ^ ((1 << hostbits) - 1), 1 << hostbits)
    for hostbits in range(32, -1, -1)
]

def getAddrType(ip):
    """Return a label for the first special-range flag set on ``ip``.

    The flags are checked in a fixed order: ``is_multicast`` ->
    ``'multicast'``, ``is_loopback`` -> ``'loopback'``, ``is_link_local`` ->
    ``'linklocal'``. When none of them is truthy the function falls through
    and implicitly returns ``None``.

    Args:
        ip: an object exposing ``is_multicast``, ``is_loopback`` and
            ``is_link_local`` attributes (presumably an ``ipaddress``
            address object, given the module's imports).
    """
    checks = (
        ('is_multicast', 'multicast'),
        ('is_loopback', 'loopback'),
        ('is_link_local', 'linklocal'),
    )
    # Attributes are read lazily and in order, stopping at the first hit.
    for flag, label in checks:
        if getattr(ip, flag):
            return label
github estnltk / estnltk / estnltk / legacy / View on Github external
    def split_by_regex(self, regex_or_pattern, flags=re.U, gaps=True):
        """Split the text into multiple instances using a regex.

        regex_or_pattern: str or compiled pattern
            The regular expression to use for splitting.
        flags: int (default: re.U)
            The regular expression flags (only used, when user has not supplied compiled regex).
        gaps: boolean (default: True)
            If True, then regions matched by the regex are not included in the resulting Text instances, which
            is expected behaviour.
            If False, then only regions matched by the regex are included in the result.

        list of Text
github scrapinghub / dateparser / dateparser / languages / View on Github external
def _get_simplifications(self, settings=None):
    """Return the compiled simplification rules, building and caching them on first use.

    Two caches are kept on the instance: ``_normalized_simplifications``
    (used when ``settings.NORMALIZE`` is truthy) and ``_simplifications``
    (used otherwise). Each is a list of single-entry dicts mapping a compiled
    pattern to its replacement.

    Args:
        settings: object exposing a ``NORMALIZE`` attribute. The body
            dereferences it unconditionally, so it must not be ``None`` by
            the time this runs.

    Returns:
        list: the cached list of ``{compiled_pattern: replacement}`` dicts
        for the requested mode.
    """
    # NOTE(review): the receiver of ``.get`` was lost in this excerpt;
    # ``self.info`` is assumed -- confirm against the full class.
    # NOTE(review): ``eval`` is applied to a configuration value here; keep
    # that value out of reach of untrusted input (or switch to a plain
    # string comparison once the accepted values are confirmed).
    no_word_spacing = eval(self.info.get('no_word_spacing', 'False'))

    def build(normalize):
        # Compile every generated rule into a {pattern: replacement} dict.
        compiled = []
        for simplification in self._generate_simplifications(normalize=normalize):
            pattern, replacement = list(simplification.items())[0]
            if not no_word_spacing:
                # Only match the rule when it is delimited by string edges,
                # non-word characters or underscores.
                pattern = r'(?<=\A|\W|_)%s(?=\Z|\W|_)' % pattern
            compiled.append({re.compile(pattern, flags=re.I | re.U): replacement})
        return compiled

    if settings.NORMALIZE:
        # The cache is assigned only after a complete build, so a failure
        # part-way through cannot leave a truncated list behind.
        if self._normalized_simplifications is None:
            self._normalized_simplifications = build(normalize=True)
        return self._normalized_simplifications

    # Non-normalized mode: without this separate path the code below was
    # unreachable (it sat after the ``return`` of the normalized branch).
    if self._simplifications is None:
        self._simplifications = build(normalize=False)
    return self._simplifications
github dmort27 / epitran / epitran / View on Github external
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper"

    Args:
        arpabet (str): base name (without ``.csv``) of the ARPAbet to IPA
                       mapping file in the package's ``data`` directory
        ligatures (bool): if True, use non-standard ligatures instead of
                          standard IPA
        cedict_file (str): path to CC-CEDict dictionary; accepted but not
                           read by this constructor
    """
    # Resolve "data/<arpabet>.csv" inside the package to a filesystem path.
    arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(arpabet)
    # Matches either a run of ASCII letters/apostrophes or a run of anything else.
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.regexp = re.compile(r'[A-Za-z]')
    self.puncnorm = PuncNorm()
    self.ligatures = ligatures
    self.ft = panphon.FeatureTable()
    # Number of feature names reported by the feature table.
    self.num_panphon_fts = len(self.ft.names)
github osm-fr / osmose-backend / modules / View on Github external
def test_diff_char(self):
    """Check that confusables_data doesn't propose the same character.

    For every group in ``confusables_data.confusables`` and every
    ``(script, proposal)`` in its ``confusables_fix`` entry, a proposal is
    counted as wrong when it does not start with a character of ``script``
    and is identical to the group itself. The test fails if any are found.
    """
    import regex
    wrong = 0
    for group in confusables_data.confusables.values():
        proposals = confusables_data.confusables_fix.get(group)
        for (script, prop) in proposals.items():
            # Named ``script_re`` so the local does not shadow the usual
            # ``re`` module name.
            script_re = regex.compile(r"[\p{%s}]" % script, flags=regex.V1 | regex.U)
            # NOTE(review): the body of the original ``if re.match(prop):``
            # branch was missing from this excerpt; it is treated as a
            # no-op, leaving only the ``elif`` case to count -- confirm.
            if not script_re.match(prop) and group == prop:
                wrong += 1
                print("group=%s, script=%s, prop=%s" % (group, script, prop))
    assert wrong == 0
github alan-turing-institute / defoe / defoe / papers / queries / View on Github external
def do_query(issues, interesting_words_file, _):
    Get the words which appear together in articles.
    # Get the list of words to search for
    interesting_words = [re.compile(r'\b' + word.strip() + r'\b', re.I | re.U)
                         for word in list(open(interesting_words_file))]
    # Map each article in each issue to a year of publication
    articles = issues.flatMap(lambda issue: [(,
                                              article) for
                                             article in
    # Find the words for each article
    interest = articles.flatMap(make_search(interesting_words))
    # Now add sum the year-word counts, and change the format for output
    interesting_by_year = interest \
        .reduceByKey(add) \
        .map(split_key) \
        .groupByKey() \
        .map(snd_to_list) \
    return interesting_by_year
github scrapinghub / dateparser / dateparser / languages / View on Github external
from six.moves import zip_longest

from dateparser.utils import normalize_unicode

# Single-character separator tokens.
PARSER_HARDCODED_TOKENS = [
    ":",
    ".",
    " ",
    "-",
    "/",
]

# Meridiem and timezone markers.
PARSER_KNOWN_TOKENS = [
    "am",
    "pm",
    "UTC",
    "GMT",
    "Z",
]

# Lower-case English words, listed as weekdays, months, time units and then
# the remaining relative/meridiem words.
KNOWN_WORD_TOKENS = (
    ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    + ['january', 'february', 'march', 'april', 'may', 'june',
       'july', 'august', 'september', 'october', 'november', 'december']
    + ['year', 'month', 'week', 'day', 'hour', 'minute', 'second']
    + ['ago', 'in', 'am', 'pm']
)

# A single opening or closing round bracket.
PARENTHESES_PATTERN = re.compile(r'[\(\)]')
# A captured run of digits.
NUMERAL_PATTERN = re.compile(r'(\d+)')
# A line containing at least one character that is a word character other
# than underscore.
KEEP_TOKEN_PATTERN = re.compile(r"^.*[^\W_].*$", re.U)

class UnknownTokenError(Exception):
    """Exception subclass named for unknown-token failures; adds no behaviour of its own."""

class Dictionary(object):
    Class that modifies and stores translations and handles splitting of date string.

    :param locale_info:
        Locale info (translation data) of the locale.
    :type language_info: dict

    :param settings:
        Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.