How to use the regex.escape function in regex

To help you get started, we’ve selected a few regex.escape examples based on popular ways it is used in public projects.
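Both the standard library’s re module and the third-party regex package expose an escape function that backslash-escapes every character with special meaning in a pattern, so arbitrary text can be embedded in a regular expression and matched literally. A minimal sketch (the sample strings are illustrative):

import re

user_input = "c++ (how much?)"                  # contains the metacharacters +, (, ) and ?
pattern = re.compile(re.escape(user_input))     # escaped, so it matches the text literally
match = pattern.search("I know c++ (how much?) of it")
print(match.group(0))                           # -> c++ (how much?)

The examples below show the same idea applied in real projects: escaping dynamic text before splicing it into character classes, alternations, substitutions and fuzzy patterns.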


Example from daniel-kukiela/nmt-chatbot, core/tokenizer.py (view on GitHub):
                    position = p.start(1) + difference
                    difference += -len(replace_from) + len(replace_to)

                    # Remove spaces
                    answer = answer[:position] + answer[position:].replace(replace_from, replace_to, 1)

        # Change placeholders back to spaces
        answer = answer.replace("##DONOTTOUCHTHISSPACE##", ' ')

        detokenized_answers.append(answer)

    return detokenized_answers


# Prepare vocab tokens from line
re_split = re.compile('(?: |^)(?:▁(▁))?([' + re.escape(r'`~!@#$%^&*()-_=+{[}]:;\'",<>?/|\\') + '0-9]|newlinechar|\.+)')
def sentence_split(sentence):

    # If not an embedded detokenizer - split by spaces
    if not preprocessing['embedded_detokenizer']:
        return sentence.split()

    global re_split

    # Prepare to split the sentence into words by ' ▁'
    line = ' ▁▁' + sentence[1:].replace('▁', '▁▁')
    line = re_split.sub(r' ▁\1\2 ▁', line)

    # Split, filter and return
    return list(filter(lambda token: token not in ('', '▁'), [token.strip() for token in line.split(' ▁')]))

# Load json file with BPE join pairs
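In re_split above, the escaped punctuation string is spliced into a character class. re.escape backslash-escapes more than a class strictly needs, but that is harmless, and it guarantees that class metacharacters such as ], ^ and - cannot terminate or reinterpret the class. The same idea in isolation (the delimiter string is illustrative):

import re

delimiters = "-[]^"                                 # all special inside a character class
splitter = re.compile("[" + re.escape(delimiters) + "]+")
print(splitter.split("a-b[c]d^e"))                  # -> ['a', 'b', 'c', 'd', 'e']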
Example from daniel-kukiela/nmt-chatbot, core/scorer.py (view on GitHub):
def check_urls(index, question, answer):
    global full_sentence_valid_url

    full_sentence_valid_url = False
    valid_url = False

    # Disabled
    if score_settings['incorrect_url_modifier_value'] is None:
        return 0

    # Find all URLs in the sentence
    for url in re.finditer('http(?:s?):(//([^/]*?)/(?:[^ ])*?(?=$|[' + re.escape(score_settings['url_delimiters']) + ']))?', answer):

        # Check if result is in cache already and return it
        if url_cache[url.group(0)][1] > time.time():
            if url_cache[url.group(0)][0] == 0:
                return score_settings['incorrect_url_modifier_value']

        # Url not in cache - check it
        else:

            # Send HEAD request and check HTTP response code
            try:
                request = requests.head(url.group(0))
                code = request.status_code
            except Exception:
                code = 0
Example from chuanconggao/extratools, extratools/strtools.py (view on GitHub):
def __findtagpairspans(
        s: str,
        tag: str, closetag: Optional[str] = None,
        useregex: bool = False
    ) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []

    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()
Example from chuanconggao/extratools, extratools/strtools.py (view on GitHub):
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
        ).finditer(s):
        yield m.group(0)
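extract builds a single alternation of literal entities, escaping each one unless the caller opts into raw regex, and relaxes spaces to \s+ so entities can match across runs of whitespace. Note that re.escape places a backslash before a literal space, so replacing only the space character can leave a stray backslash in the pattern; a safer variant escapes each word separately, as in this sketch (the entity list is illustrative):

import re

entities = ["new york", "st. louis"]
# Escape each word on its own, then allow flexible whitespace between words
alternation = "|".join(
    r"\s+".join(re.escape(word) for word in entity.split()) for entity in entities
)
pattern = re.compile(r"\b(?:{})\b".format(alternation), re.I)
print(pattern.findall("Flights from New  York to St. Louis"))  # -> ['New  York', 'St. Louis']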
Example from norbusan/calibre-debian, src/calibre/gui2/tweak_book/reports.py (view on GitHub):
    if boss is None:
        return
    name = loc.name
    editor = boss.edit_file_requested(name)
    if editor is None:
        return
    editor = editor.editor
    if loc.line_number is not None:
        block = editor.document().findBlockByNumber(loc.line_number - 1)  # blockNumber() is zero based
        if not block.isValid():
            return
        c = editor.textCursor()
        c.setPosition(block.position(), c.MoveAnchor)
        editor.setTextCursor(c)
        if loc.text_on_line is not None:
            editor.find(regex.compile(regex.escape(loc.text_on_line)))
Example from PolCPP/PSO2es-Translation, _tools/ImportItemSetContents.py (view on GitHub):
    contents_file_name = "Item_" + contents_file_name + ".txt"
    try:
        contents_file = codecs.open(os.path.join(json_loc, contents_file_name),
                                    mode='r', encoding='utf-8')
    except FileNotFoundError:
        print("\t{0} not found.".format(contents_file_name))
        continue

    contents = json.load(contents_file)
    print("{0} loaded.".format(contents_file_name))

    repcount = 0  # Number of items in ItemBags translated from this file
    for item in contents:
        name_en = item["tr_text"]
        if name_en != "":
            name_jp = regex.escape(item["jp_text"])  # Escape [] in In/Ba/Ou
            repcount += len(regex.findall(r"\[" + name_jp + r"\]", itembags))
            itembags = regex.sub(r"\[" + name_jp + r"\]",
                                 "[" + name_en + "]",
                                 itembags)
    print("  Translated {0} item name{1}."
          .format(repcount,
                  "" if repcount == 1 else "s"))
    contents_file.close()
    print("{0} closed.".format(contents_file_name))

# Clean up eyelash colours and Cast parts
itembags = regex.sub(r'Black\] \(4 colors\)',
                     '(4 colors)]',
                     itembags)
itembags = regex.sub(r'Black (.+)\] \(4 colors\)',
                     r'\1 (4 colors)]',
                     itembags)
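Here regex.escape protects the Japanese item names before they are interpolated into the search pattern, so names containing +, ( or other metacharacters still match as literal text. A reduced sketch of the same substitution (the strings are illustrative):

import regex

itembags = "[ソードII+] drops here"
name_jp = regex.escape("ソードII+")                # the trailing '+' would otherwise be a quantifier
itembags = regex.sub(r"\[" + name_jp + r"\]", "[Sword II+]", itembags)
print(itembags)                                    # -> [Sword II+] drops here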
Example from Sefaria/Sefaria-Project, scripts/count_index_dependencies.py (view on GitHub):
def dep_counts(name):
    ref_patterns = {
        'alone': r'^{} \d'.format(re.escape(name)),
        'commentor': r'{} on'.format(re.escape(name)),
        'commentee': r'on {} \d'.format(re.escape(name))
    }

    commentee_title_pattern = r'on {}'.format(re.escape(name))

    ret = {
        'version title exact match': text.VersionSet({"title": name}).count(),
        'version title match commentor': text.VersionSet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'version title match commentee': text.VersionSet({"title": {"$regex": commentee_title_pattern}}).count(),
        'history title exact match': history.HistorySet({"title": name}).count(),
        'history title match commentor': history.HistorySet({"title": {"$regex": ref_patterns["commentor"]}}).count(),
        'history title match commentee': history.HistorySet({"title": {"$regex": commentee_title_pattern}}).count(),
    }

    for pname, pattern in ref_patterns.items():
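Each pattern above interpolates the escaped index title, which matters because titles can contain characters such as parentheses that would otherwise change the pattern’s meaning. In isolation (the title is illustrative):

import re

name = "Rashi (on Genesis)"
alone = r"^{} \d".format(re.escape(name))                  # anchored: title followed by a chapter number
print(bool(re.match(alone, "Rashi (on Genesis) 1:1")))     # -> True
print(bool(re.match(alone, "Not Rashi (on Genesis) 1")))   # -> False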
Example from somebody1234/Charcoal, wolfram.py (view on GitHub):
def __radd__(self, other):
        if isinstance(other, String):
            return Pattern(re_escape(str(other)) + str(self))
        if isinstance(other, str):
            return Pattern(re_escape(other) + str(self))
Example from hellohaptik/chatbot_ner, ner_v1/detectors/textual/text/text_detection.py (view on GitHub):
                if u' '.join(TOKENIZER.tokenize(variant)) in _text[index]:
                    exact_matches.append(variant)
                else:
                    fuzzy_variants.append(variant)
            exact_matches.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
            fuzzy_variants.sort(key=lambda s: len(TOKENIZER.tokenize(s)), reverse=True)
            variants_list = exact_matches + fuzzy_variants

            for variant in variants_list:

                original_text = self._get_entity_substring_from_text(self.__processed_texts[index], variant)
                if original_text:
                    value_final_list.append(variants_to_values[variant])
                    original_final_list.append(original_text)

                    boundary_punct_pattern = re.compile(r'(^[{0}]+)|([{0}]+$)'.format(re.escape(string.punctuation)))
                    original_text_ = boundary_punct_pattern.sub("", original_text)

                    _pattern = re.compile(r'\b%s\b' % re.escape(original_text_), flags=_re_flags)
                    self.__tagged_texts[index] = _pattern.sub(self.tag, self.__tagged_texts[index])
                    # Instead of dropping completely like in other entities,
                    # we replace with tag to avoid matching non contiguous segments
                    self.__processed_texts[index] = _pattern.sub(self.tag, self.__processed_texts[index])
            value_final_list_.append(value_final_list)
            original_final_list_.append(original_final_list)

        return value_final_list_, original_final_list_
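The \b-wrapped pattern built from the escaped substring ensures only whole-word occurrences are replaced with the tag: the trailing boundary stops a match on "new york" from firing inside "new yorker". A sketch of that tagging step (the names are illustrative):

import re

processed_text = "i live in new york. a new yorker told me so."
original_text_ = "new york"
tag = "__city__"
pattern = re.compile(r"\b%s\b" % re.escape(original_text_), flags=re.IGNORECASE)
print(pattern.sub(tag, processed_text))
# -> i live in __city__. a new yorker told me so.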
Example from skoczen/will, will/backends/generation/fuzzy_best_match.py (view on GitHub):
        method_path = method_meta["plugin_info"]["parent_path"]
        if method_path not in self.cached_regex:

            regex_string = method_meta["regex_pattern"]
            if "case_sensitive" in method_meta and not method_meta["case_sensitive"]:
                regex_string = "(?i)%s" % regex_string

            if method_meta["multiline"]:
                try:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex_string,
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.MULTILINE | regex.DOTALL | regex.ENHANCEMATCH)
                except regex.error:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex.escape(regex_string),
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.MULTILINE | regex.DOTALL | regex.ENHANCEMATCH)
            else:
                try:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex_string,
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.ENHANCEMATCH)
                except regex.error:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex.escape(regex_string),
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.ENHANCEMATCH)

        return self.cached_regex[method_path]
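If a plugin supplies a string that is not valid regex syntax, the backend falls back to compiling the escaped string, so it is still usable as a fuzzy literal match. The fallback idea in isolation, using the regex module’s fuzzy-matching syntax (the helper name and error budget are illustrative):

import regex

def compile_fuzzy(pattern_string, max_errors=2):
    """Compile with fuzzy matching, falling back to a literal match on bad syntax."""
    try:
        return regex.compile("(?:%s){e<=%d}" % (pattern_string, max_errors))
    except regex.error:
        return regex.compile("(?:%s){e<=%d}" % (regex.escape(pattern_string), max_errors))

print(bool(compile_fuzzy("hello (world").search("hello (world")))  # invalid syntax -> literal match
print(bool(compile_fuzzy("colou?r").search("color")))              # valid regex compiles as-is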