How to use the regex.finditer function in regex

To help you get started, we’ve selected a few regex.finditer examples, based on popular ways it is used in public projects.


github TaoMiner / JointTextKgLearning / preprocess / preprocess.py View on Github
                line_count += 1
                if line_count%10000 == 0 :
                    endtime = datetime.datetime.now()
                    print("has processed: %d lines, takes %d seconds..." % (line_count, (endtime - starttime).seconds))
                # split the paragraphs after removing references, head entity and href
                paras = re.split(para_p, re.sub(trim_href_p, "", line.lower()))
                for para in paras:
                    sent_pos = 0
                    words_set = []
                    entity_index = []
                    ent_dic = {}
                    # skip the para within length of 30 or Nonetype
                    if not para or len(para) <=30:
                        continue
                    # iterate all the anchors in wiki text
                    for anchor in re.finditer(anchor_p, para):
                        segment(para[sent_pos:anchor.start()], words_set)
                        anchor_word = toWord(anchor.group(), ent_dic)
                        if anchor_word:
                            entity_index.append(len(words_set))
                            words_set.append(anchor_word)
                        sent_pos = anchor.end()
                    if sent_pos < len(para):
                        segment(para[sent_pos:len(para)], words_set)
                    if len(words_set) > 8:
                        texts.append(" ".join(words_set)+"\n")
                        if len(texts) >= 10000:
                            fout_text.writelines(texts)
                            del texts[:]
                    for i in entity_index:
                        anchors.append(ent_dic[words_set[i]]+"\t\t"+";".join(reversed(words_set[max(0,i-ent_half_window-1):i]))+"\n")
                        anchors.append(ent_dic[words_set[i]] + "\t\t"+";".join(words_set[i+1:min(len(words_set), i+1+ent_half_window)])+"\n")
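The snippet above uses the common cursor-plus-finditer walk: keep a running position, and for each match use match.start() and match.end() to pick up the text between matches as well as the match itself. Below is a minimal, self-contained sketch of that idea; the wiki-link pattern and helper name are illustrative, not taken from the project.

import re

def split_on_matches(pattern, text):
    # Walk the matches, emitting the text between them as well as the matches,
    # the same bookkeeping the snippet does with sent_pos / anchor.start() / anchor.end().
    pos = 0
    parts = []
    for m in re.finditer(pattern, text):
        if pos < m.start():
            parts.append(("text", text[pos:m.start()]))
        parts.append(("match", m.group()))
        pos = m.end()
    if pos < len(text):
        parts.append(("text", text[pos:]))
    return parts

print(split_on_matches(r"\[\[.*?\]\]", "see [[Python]] and [[Regex]] here"))
# [('text', 'see '), ('match', '[[Python]]'), ('text', ' and '), ('match', '[[Regex]]'), ('text', ' here')]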
github delph-in / pydelphin / delphin / repp.py View on Github
def _tokenize(result: REPPResult, pattern: str) -> List[Tuple[int, int, str]]:
    s, sm, em = result  # unpack for efficiency in loop
    toks = []
    pos = 0
    for m in re.finditer(pattern, result.string):
        if pos < m.start():
            toks.append((pos + sm[pos + 1],
                         m.start() + em[m.start()],
                         s[pos:m.start()]))
        pos = m.end()
    if pos < len(s):
        toks.append((pos + sm[pos + 1],
                     len(s) + em[len(s)],
                     s[pos:]))
    return toks
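Here finditer drives a split-like tokenizer: the tokens are the stretches between matches, but unlike re.split the character offsets are preserved. A stripped-down sketch without the REPP characterization maps follows; the function name is made up for illustration.

import re
from typing import List, Tuple

def split_with_offsets(pattern: str, s: str) -> List[Tuple[int, int, str]]:
    # Like re.split(), but keeps the (start, end) offsets of every token.
    toks = []
    pos = 0
    for m in re.finditer(pattern, s):
        if pos < m.start():
            toks.append((pos, m.start(), s[pos:m.start()]))
        pos = m.end()
    if pos < len(s):
        toks.append((pos, len(s), s[pos:]))
    return toks

print(split_with_offsets(r"\s+", "the quick  fox"))
# [(0, 3, 'the'), (4, 9, 'quick'), (11, 14, 'fox')]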
github stenskjaer / samewords / samewords / document.py View on Github
def chunk_doc(content: str) -> List[str]:
    """
    Split document into a list of chunks. Chunks at odd-numbered indices
    contain numbered text.

    :param content: The content of the document as a string.
    """
    starts = regex.finditer(r"\\beginnumbering\n", content)
    ends = regex.finditer(r"\n\\endnumbering", content)
    if "\\beginnumbering\n" in content:
        indices = []
        for start, end in zip(starts, ends):
            if not indices:
                # Setup the indices with the slice before first numbered text
                indices.append([0, start.span()[0]])
            else:
                # Add the indices between previous numbered section and next
                indices.append([indices[-1][-1], start.span()[0]])
            # Now, add the indices of the numbered section
            indices.append([start.span()[0], end.span()[1]])
        # Add the tail from last numbered to end
        try:
            indices.append([indices[-1][-1], len(content) + 1])
        except IndexError:
            raise ValueError(
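Because finditer returns a lazy iterator, two of them can be zipped to pair up begin/end markers and slice out the enclosed blocks, which is what chunk_doc does with \beginnumbering and \endnumbering. A minimal sketch with made-up marker strings:

import re

def paired_blocks(text, begin, end):
    # Zip two match iterators to pair each begin marker with the next end marker.
    starts = re.finditer(begin, text)
    ends = re.finditer(end, text)
    return [(s.start(), e.end()) for s, e in zip(starts, ends)]

doc = "intro <begin>one<end> middle <begin>two<end> tail"
for start, stop in paired_blocks(doc, r"<begin>", r"<end>"):
    print(doc[start:stop])
# <begin>one<end>
# <begin>two<end>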
github microsoft / Recognizers-Text / Python / libraries / recognizers-date-time / recognizers_date_time / date_time / base_merged.py View on Github
def try_merge_modifier_token(self, extract_result: ExtractResult, pattern: Pattern, source: str,
                                 potential_ambiguity: bool = False) -> bool:
        before_str = source[0:extract_result.start]
        after_str = source[extract_result.start: extract_result.length]

        # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
        if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
                regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(regex.finditer(self.config.potential_ambiguous_range_regex, source))
            if matches and len(matches):
                return any(match.start() < extract_result.start + extract_result.length and match.end() > extract_result.start for match in matches)
                # return self._filter_item(extract_result, matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            mod_len = len(before_str) - token.index
            extract_result.length += mod_len
            extract_result.start -= mod_len
            extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]

            extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
            return True
        elif self.config.check_both_before_after:
            # check also after_str
            after_str = source[extract_result.start: extract_result.length]
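The key finditer usage here is materializing all matches with list(...) and testing whether any of them overlaps the extracted span. A reduced sketch of that overlap test; the names and example pattern are illustrative.

import re

def overlaps_any_match(pattern, text, start, length):
    # True if any match of pattern overlaps the half-open span [start, start + length).
    end = start + length
    return any(m.start() < end and m.end() > start
               for m in re.finditer(pattern, text))

print(overlaps_any_match(r"from .* to", "from May to June", start=5, length=3))  # True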
github microsoft / Recognizers-Text / Python / libraries / recognizers-date-time / recognizers_date_time / date_time / base_datetime.py View on Github
def basic_regex_match(self, source: str) -> List[Token]:
        tokens: List[Token] = list()
        # handle "now"
        matches: List[Match] = list(
            regex.finditer(self.config.now_regex, source))
        tokens.extend(map(lambda x: Token(x.start(), x.end()), matches))
        return tokens
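This extractor simply maps each match onto a lightweight Token carrying the match span. The same shape, using a NamedTuple as a stand-in for the library's Token class:

import re
from typing import List, NamedTuple

class Token(NamedTuple):
    start: int
    end: int

def spans_of(pattern: str, source: str) -> List[Token]:
    # Turn every match into a (start, end) token, as basic_regex_match does for "now".
    return [Token(m.start(), m.end()) for m in re.finditer(pattern, source)]

print(spans_of(r"\bnow\b", "do it now, right now"))
# [Token(start=6, end=9), Token(start=17, end=20)]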
github facelessuser / backrefs / backrefs / bregex.py View on Github
def finditer(pattern, string, *args, **kwargs):
    """Wrapper for `finditer`."""

    flags = args[2] if len(args) > 2 else kwargs.get('flags', 0)
    return _regex.finditer(_apply_search_backrefs(pattern, flags), string, *args, **kwargs)
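bregex wraps finditer so the pattern can be preprocessed before the real regex module sees it. Below is a toy wrapper in the same spirit, substituting a trivial transformation for _apply_search_backrefs.

import re

def finditer_ci(pattern, string, *args, **kwargs):
    # Rewrite the pattern (here: force case-insensitive matching) and delegate to re.finditer.
    return re.finditer("(?i:" + pattern + ")", string, *args, **kwargs)

print([m.group() for m in finditer_ci(r"cat", "Cat cAt CAT")])
# ['Cat', 'cAt', 'CAT']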
github microsoft / Recognizers-Text / Python / libraries / recognizers-sequence / recognizers_sequence / sequence / extractors.py View on Github
            map(lambda x: MatchesVal(matches=list(re.finditer(x.re, source)), val=x.val), self.regexes))
        matches_list = list(filter(lambda x: len(x.matches) > 0, matches_list))
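Here several regexes are run over the same source, each paired with a value, and pairs with no matches are filtered out. A self-contained version of that pattern; MatchesVal and the example regexes are illustrative.

import re
from typing import List, NamedTuple, Tuple

class MatchesVal(NamedTuple):
    matches: list
    val: str

def collect(regexes: List[Tuple[str, str]], source: str) -> List[MatchesVal]:
    # Run finditer for every (pattern, value) pair and keep only the pairs that matched.
    collected = [MatchesVal(list(re.finditer(p, source)), val) for p, val in regexes]
    return [mv for mv in collected if mv.matches]

result = collect([(r"\d+", "number"), (r"[A-Z]{3}", "code")], "order 42 and 7")
print([(mv.val, [m.group() for m in mv.matches]) for mv in result])
# [('number', ['42', '7'])]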
github chuanconggao / extratools / extratools / strtools.py View on Github
def __findeqtagpairspans(
        s: str,
        tag: str,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)), s):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
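This example builds the pattern with named groups and a backreference (\1) so that each match exposes the open tag, the content, and the close tag as separate spans via match.span(name). A small runnable sketch with a made-up example tag:

import re

def tag_pair_spans(s, tag):
    # Named groups plus a backreference (\1) find text enclosed by a pair of identical tags;
    # match.span(name) reports where each group matched.
    pattern = r"(?P<open>{0})(?P<content>.*?)(?P<close>\1)".format(re.escape(tag))
    for m in re.finditer(pattern, s):
        yield m.span("open"), m.span("content"), m.span("close")

print(list(tag_pair_spans("a **bold** word", "**")))
# [((2, 4), (4, 8), (8, 10))]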
github microsoft / Recognizers-Text / Python / libraries / recognizers-date-time / recognizers_date_time / date_time / chinese / merged_extractor.py View on Github
def _filter_ambiguity(self, extract_results: List[ExtractResult], text: str, ) -> List[ExtractResult]:

        if self.config.ambiguity_filters_dict is not None:
            for regex_var in self.config.ambiguity_filters_dict:
                regex_var_value = self.config.ambiguity_filters_dict[regex_var]

                try:
                    reg_len = list(filter(lambda x: x.group(), regex.finditer(regex_var_value, text)))

                    reg_length = len(reg_len)
                    if reg_length > 0:

                        matches = reg_len
                        new_ers = list(filter(lambda x: list(
                            filter(lambda m: m.start() < x.start + x.length and m.start() +
                                   len(m.group()) > x.start, matches)), extract_results))
                        if len(new_ers) > 0:
                            for item in extract_results:
                                for i in new_ers:
                                    if item is i:
                                        extract_results.remove(item)
                except Exception:
                    pass
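As in the earlier merged extractor, finditer collects all matches of an ambiguity regex, and every extraction result whose span overlaps one of them is dropped. The same filtering, reduced to plain tuples; the result format and pattern here are invented for illustration.

import re

def drop_overlapping(results, filter_pattern, text):
    # Remove every (start, length, label) result whose span overlaps a match of filter_pattern.
    matches = list(re.finditer(filter_pattern, text))
    def overlaps(start, length):
        return any(m.start() < start + length and m.end() > start for m in matches)
    return [r for r in results if not overlaps(r[0], r[1])]

text = "see you in may 2021"
results = [(11, 3, "month"), (15, 4, "year")]
print(drop_overlapping(results, r"in may\b", text))
# [(15, 4, 'year')]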
github tsproisl / SoMaJo / somajo / tokenizer.py View on Github
def _split_emojis(self, node, token_class="emoticon"):
        boundaries = []
        for m in re.finditer(r"\X", node.value.text):
            if m.end() - m.start() > 1:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
            else:
                if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                    boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(node, boundaries, token_class)
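Note that the \X grapheme-cluster escape and the \p{...} property classes used above require the third-party regex module (which this tokenizer imports under the name re); they are not supported by the standard-library re. A minimal sketch of the same grapheme walk, with an invented helper name:

import regex  # third-party package; \X and \p{...} are not available in the stdlib re module

def emoji_spans(text):
    # Iterate over grapheme clusters with \X and keep those with emoji presentation.
    for m in regex.finditer(r"\X", text):
        if regex.search(r"\p{Emoji_Presentation}", m.group()):
            yield m.span()

print(list(emoji_spans("hi 😀 there")))
# [(3, 4)]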