How to use the regex.UNICODE flag in regex

To help you get started, we’ve selected a few regex.UNICODE examples based on popular ways the flag is used in public projects.

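Before the project examples, here is a minimal, self-contained sketch of our own (none of the names below come from these projects). regex.UNICODE makes shorthand classes such as \w and \s follow Unicode rules; for str patterns it is effectively the default in Python 3, so passing it mainly documents intent, usually in combination with other flags.

import regex

# Match runs of word characters under Unicode rules, ignoring case.
WORDS = regex.compile(r"\w+", flags=regex.UNICODE | regex.IGNORECASE)

print(WORDS.findall("Ünïcode wörds, 数字 123"))  # ['Ünïcode', 'wörds', '数字', '123']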

github facebookresearch / ParlAI / parlai / agents / tfidf_retriever / tokenizers / simple_tokenizer.py
def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning(
                '%s only tokenizes! Skipping annotators: %s'
                % (type(self).__name__, kwargs.get('annotators'))
            )
        self.annotators = set()
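
A note on the flag arithmetic above: regex.IGNORECASE + regex.UNICODE + regex.MULTILINE works because each flag is a distinct bit value, but the bitwise form regex.IGNORECASE | regex.UNICODE | regex.MULTILINE is the conventional spelling and cannot double-count a flag by accident.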
github thunlp / OpenQA / main.py
def has_answer(args, answer, t):
    global PROCESS_TOK
    text = [w.lower() for w in t]
    res_list = []
    if args.dataset == "CuratedTrec":
        # CuratedTrec answers are stored as regular expressions.
        try:
            ans_regex = re.compile("(%s)" % answer[0], flags=re.IGNORECASE + re.UNICODE)
        except Exception:
            # Malformed pattern: treat it as "no answer found".
            return False, res_list
        paragraph = " ".join(text)
        answer_new = ans_regex.findall(paragraph)
        for a in answer_new:
            single_answer = normalize(a[0])
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i+len(single_answer)-1))
    else:
        for a in answer:
            single_answer = " ".join(a).lower()
            single_answer = normalize(single_answer)
            single_answer = PROCESS_TOK.tokenize(single_answer)
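
The branch shown here treats CuratedTrec answers as regular expressions (hence the compile with re.IGNORECASE + re.UNICODE and the guard around it), while the other datasets are matched as literal answer strings that are lowercased, normalized and tokenized exactly like the passage before the sliding-window comparison.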
github norbusan / calibre-debian / src / calibre / ebooks / oeb / polish / stats.py
def __init__(self, container, do_embed=False):
        if self.first_letter_pat is None:
            StatsCollector.first_letter_pat = self.first_letter_pat = regex.compile(
                r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)
            StatsCollector.capitalize_pat = self.capitalize_pat = regex.compile(
                r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE)

        self.collect_font_stats(container, do_embed)
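
Both patterns combine regex.VERSION1 with regex.UNICODE: first_letter_pat matches any leading run of opening/closing brackets, quotes and other punctuation (\p{Ps}, \p{Pe}, \p{Pi}, \p{Pf}, \p{Po}), and capitalize_pat matches a single Unicode letter or digit (\p{L}, \p{N}). The repeated \p{Ps} inside the first character class is redundant but harmless, since a character class is a set.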
github fnl / syntok / syntok / tokenizer.py
_apostrophe_t = regex.compile('[' + _apostrophes + ']t')
    """Apostrophe-t regex, to detect "n't" suffixes."""

    # about 25% of the runtime of the tokenizer is spent with this regex
    _separation = regex.compile(
        r"(?<=\p{Ll})[.!?]?(?=\p{Lu})|" +  # lowercase-uppercase transitions
        r"[" + _apostrophes + r"]\p{L}+|" +  # apostrophes and their tail
        r"[\p{Ps}\p{Pe}]|" +   # parenthesis and open/close punctuation
        r"\.\.\.|" +  # inner ellipsis
        r"(?<=\p{L})[,;_" + _hyphens + r"](?=[\p{L}\p{Nd}])|" +  # dash-not-digits transition prefix
        r"(?<=[\p{L}\p{Nd}])[,;_" + _hyphens + r"](?=\p{L})"  # dash-not-digits transition postfix
    )
    """Secondary regex to sub-split non-whitespace sequences."""

    _spaces = regex.compile(r"\S+", regex.UNICODE)
    """Primary regex to split strings at any kind of Unicode whitespace."""

    @staticmethod
    def join_hyphenated_words_across_linebreaks(text: str) -> str:
        """Join 'hyhen-\\n ated wor- \\nds' to 'hyphenated words'."""
        return Tokenizer._hyphen_newline.subn("", text)[0]

    @staticmethod
    def to_text(tokens: List[Token]) -> str:
        """
        Reconstruct the original text where the Tokens were found.

        This works because a Token stores its spacing prefix.
        """
        return "".join(map(str, tokens))
github norbusan / calibre-debian / src / calibre / ebooks / css_transform_rules.py
def compile_pat(pat):
    import regex
    REGEX_FLAGS = regex.VERSION1 | regex.UNICODE | regex.IGNORECASE
    return regex.compile(pat, flags=REGEX_FLAGS)
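
compile_pat gives the surrounding code a single place that fixes its flag set: regex.VERSION1 opts into the module's newer behaviour (nested character classes, for example), and spelling out regex.UNICODE | regex.IGNORECASE makes case-insensitive Unicode matching explicit even though UNICODE is already the default for str patterns. A hypothetical call would be compile_pat(r'\bfont-family\b').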
github ufal / mtmonkey / worker / src / util / tokenize.py
def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.lowercase = True if options.get('lowercase') else False
        self.moses_escape = True if options.get('moses_escape') else False
        # compile regexes
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.__ascii_junk = Regex(r'[\000-\037]')
        self.__special_chars = \
                Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
        # single quotes: all unicode quotes + prime
        self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
        # double quotes: all unicode chars incl. Chinese + double prime + ditto
        self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
        self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
        self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
        self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
        # hyphen: separate every time but for unary minus
        self.__minus = Regex(r'([-−])')
        self.__pre_notnum = Regex(r'(-)([^\p{N}])')
        self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
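
Regex and UNICODE here are names imported from the regex package (Regex is the package's alias for regex.compile), presumably via something like from regex import Regex, UNICODE; that import is what makes property classes such as \p{N} and \p{IsAlnum} available in these patterns.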
github OpenEdition / bilbo / src / bilbo / format / Clean.py
		"""
		xmlEntitiesDecode
		"""
		#for numerical codes
		matches = re.findall("&#\d+;", tmp_str, flags=re.UNICODE)
		if len(matches) > 0 :
			hits = set(matches)
			for hit in hits :
				name = hit[2:-1]
				try :
					entnum = int(name)
					tmp_str = tmp_str.replace(hit, unichr(entnum))
				except ValueError:
					pass
	
		#for hex codes
		matches = re.findall("&#[xX][0-9a-fA-F]+;", tmp_str, flags=re.UNICODE)
		if len(matches) > 0 :
			hits = set(matches)
			for hit in hits :
				hex = hit[3:-1]
				try :
					entnum = int(hex, 16)
					tmp_str = tmp_str.replace(hit, unichr(entnum))
				except ValueError:
					pass
		
		return tmp_str
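
The snippet above is Python 2 code (note unichr). A minimal Python 3 sketch of the same numeric-entity decoding, using a hypothetical helper name of our own, might look like this:

import re

def decode_numeric_entities(text):
    # re.UNICODE is the default for str patterns in Python 3; kept to mirror the original.
    # Decimal entities such as &#233;
    for hit in set(re.findall(r"&#\d+;", text, flags=re.UNICODE)):
        try:
            text = text.replace(hit, chr(int(hit[2:-1])))
        except ValueError:
            pass
    # Hexadecimal entities such as &#xE9;
    for hit in set(re.findall(r"&#[xX][0-9a-fA-F]+;", text, flags=re.UNICODE)):
        try:
            text = text.replace(hit, chr(int(hit[3:-1], 16)))
        except ValueError:
            pass
    return text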
github eflglobal / filters / filters / string.py
super(Unicode, self).__init__()

        self.encoding   = encoding
        self.normalize  = normalize

        if self.normalize:
            #
            # Compile the regex that we will use to remove non-
            # printables from the resulting unicode.
            # http://www.regular-expressions.info/unicode.html#category
            #
            # Note: using a double negative so that we can exclude
            # newlines, which are technically considered control chars.
            # http://stackoverflow.com/a/3469155
            #
            self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
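
The double negative [^\P{C}\s] matches characters that are in the Unicode C ("other") categories, such as control and format characters, but are not whitespace, so newlines and tabs survive. A quick check of that intent (our own example):

import regex

npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
print(repr(npr.sub('', 'a\x00b\nc')))  # 'ab\nc': the NUL is stripped, the newline is kept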
github ufal / neuralmonkey / scripts / tokenize_data.py
sys.stderr = codecs.getwriter('utf-8')(sys.stderr)

    try:
        if args.language == "german":
            decompounder = get_decompounder()
            decompounded_count = 0

        for ln, line in enumerate(sys.stdin):
            line = re.sub(r"[[:space:]]+", " ", line.rstrip())
            line = re.sub(r"^[[:space:]]+", "", line)
            line = re.sub(r"''", "\"", line)
            line = re.sub(r"``", "\"", line)
            line = re.sub(r"-([[:punct:]\$])", "\g<1>", line)
            line = re.sub(r"([[:punct:]\$])-", "\g<1>", line)
            line = re.sub(r"^[[:space:]]*-[[:space:]]", "", line)
            line = re.sub(r"([[:alpha:]0-9ß])-([ [:punct:]])", "\g<1>\g<2>", line, re.UNICODE)
            line = re.sub(r"([ [:punct:]])-([[:alpha:]0-9ß])", "\g<1>\g<2>", line, re.UNICODE)
            line = re.sub(r" - ", " – ", line)
            line = re.sub(r"– -", "–", line)

            def normalize_quotes(token):
                token = re.sub(r"-$", '', token)
                token = re.sub(r"``", '\u201c', token)
                token = re.sub(r"''", '\u201d', token)
                return token

            tokenized = [normalize_quotes(t) for t in word_tokenize(line, language=args.language)]

            if args.language == "german":
                for i, token in enumerate(tokenized):
                    decompounded_count += 1
                    decompounded = decompounder.splitWord(token)
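
The POSIX-style classes used here ([[:space:]], [[:punct:]], [[:alpha:]]) are an extension of the regex module, so the script presumably imports it under the familiar name (for example import regex as re); the standard library would treat [[:alpha:]] as an ordinary character set of those literal characters. Note also that flags must be passed to re.sub as a keyword argument, since the fourth positional parameter is count, not flags.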