How to use the regex.sub function in regex

To help you get started, we’ve selected a few examples of regex.sub, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github andychase / reparse / reparse / util.py View on Github external
def separate_string(string):
    """
    Split *string* at angle brackets and return the alternating pieces.

    Every ``>`` is first rewritten to ``<`` so a single split handles both
    delimiters. A ``<`` that is immediately followed by ``!`` or ``=`` is
    not treated as a delimiter. The result is a pair of lists: the pieces at
    even positions of the split and the pieces at odd positions.

    >>> separate_string("test <2>")
    (['test ', ''], ['2'])
    """
    unified = regex.sub(r'>', '<', string)
    pieces = regex.split(r'<(?![!=])', unified)
    evens = pieces[0::2]
    odds = pieces[1::2]
    return evens, odds
github abuccts / wikt2pron / pywiktionary / IPA / IPA.py View on Github external
Notes
    -----
    - Use ``_j`` for palatalized instead of ``'``
    - Use ``=`` for syllabic instead of ``_=``
    - Use ``~`` for nasalization instead of ``_~``
    - Please refer to :doc:`sym` for more details.

    Examples
    --------
    >>> IPA_text = "/t͡ʃeɪnd͡ʒ/" # en: [[change]]
    >>> XSAMPA_text = IPA_to_XSAMPA(IPA_text)
    >>> XSAMPA_text
    "/t__SeInd__Z/"
    """
    text = re.sub("ːː", ":", text)
    text += " "
    XSAMPA_lst = []
    i = 0
    while i < len(text) - 1:
        if text[i:i+2] in i2x_lookup.keys():
            XSAMPA_lst.append(i2x_lookup[text[i:i+2]])
            i += 1
        elif text[i] in i2x_lookup.keys():
            XSAMPA_lst.append(i2x_lookup[text[i]])
        else:
            XSAMPA_lst.append(text[i])
        i += 1
    return "".join(XSAMPA_lst)
github standardebooks / tools / se / se_epub_build.py View on Github external
processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mi)&gt;(.+?)&lt;((?:m:)?mi)&gt;(.+?)", "<i>\\4</i>&lt;\\2&gt;<i>\\6</i>", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mi)&gt;(.+?)&lt;((?:m:)?mn)&gt;(.+?)", "<i>\\4</i>&lt;\\2&gt;\\6", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mn)&gt;(.+?)&lt;((?:m:)?mn)&gt;(.+?)", "\\4&lt;\\2&gt;\\6", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mn)&gt;(.+?)&lt;((?:m:)?mi)&gt;(.+?)", "\\4&lt;\\2&gt;<i>\\6</i>", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mi) mathvariant=\"normal\"&gt;(.+?)&lt;((?:m:)?mi)&gt;(.+?)", "\\4&lt;\\2&gt;<i>\\6</i>", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m(sub|sup))&gt;&lt;((?:m:)?mi) mathvariant=\"normal\"&gt;(.+?)&lt;((?:m:)?mn)&gt;(.+?)", "\\4&lt;\\2&gt;\\6", processed_line)
										processed_line = regex.sub(r"&lt;(?:m:)?mo&gt;{}".format(se.FUNCTION_APPLICATION), "", processed_line, flags=regex.IGNORECASE) # The ignore case flag is required to match here with the special FUNCTION_APPLICATION character, it's unclear why
										processed_line = regex.sub(r"&lt;(?:m:)?mfenced&gt;&lt;((?:m:)(?:mo|mi|mn|mrow))&gt;(.+?)", "(&lt;\\1&gt;\\2)", processed_line)
										processed_line = regex.sub(r"&lt;(?:m:)?mrow&gt;([^&gt;].+?)", "\\1", processed_line)
										processed_line = regex.sub(r"&lt;(?:m:)?mi&gt;([^&lt;]+?)", "<i>\\1</i>", processed_line)
										processed_line = regex.sub(r"&lt;(?:m:)?mi mathvariant=\"normal\"&gt;([^&lt;]+?)", "\\1", processed_line)
										processed_line = regex.sub(r"&lt;(?:m:)?mo&gt;([+\-−=×])", " \\1 ", processed_line)
										processed_line = regex.sub(r"&lt;((?:m:)?m[no])&gt;(.+?)", "\\2", processed_line)
										processed_line = regex.sub(r"", "", processed_line)
										processed_line = processed_line.strip()
										processed_line = regex.sub(r"<i>", "", processed_line, flags=regex.DOTALL)

									# Did we succeed? Is there any more MathML in our string?
									if regex.findall("".format(mathml_count))
										mathml_count = mathml_count + 1
									else:
										# Success! Replace the MathML with our new string.
										processed_xhtml = processed_xhtml.replace(line, processed_line)

							if processed_xhtml != xhtml:
								file.seek(0)
								file.write(processed_xhtml)
								file.truncate()</i>
github xiangzhemeng / Kaggle-Twitter-Sentiment-Analysis / data_preprocessing.py View on Github external
# Ordered (contraction, expansion) pairs, applied one after another.
# Suffix-style contractions ("'re", "n't", "'ve", "'d", "'ll") expand with a
# leading space so the expansion stays a separate word ("you're" -> "you are")
# instead of being glued onto the preceding one ("youare").
_CONTRACTION_EXPANSIONS = (
    ("i'm", "i am"),
    ("'re", " are"),
    ("he's", "he is"),
    ("it's", "it is"),
    ("that's", "that is"),
    ("who's", "who is"),
    ("what's", "what is"),
    ("n't", " not"),
    ("'ve", " have"),
    ("'d", " would"),
    ("'ll", " will"),
)


def abbreviation_replacement(text):
    """Expand common English contractions and pad punctuation with spaces.

    Contractions are replaced literally and case-sensitively, in the order
    listed in ``_CONTRACTION_EXPANSIONS``. Afterwards each of the characters
    ``, ! . ( ) ?`` is surrounded by a single space on both sides so that it
    can be split off as its own token.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    for contraction, expansion in _CONTRACTION_EXPANSIONS:
        text = text.replace(contraction, expansion)
    # A raw replacement template avoids the invalid "\." style escapes, which
    # re.sub would otherwise copy into the output as literal backslashes.
    return re.sub(r"([,!.()?])", r" \1 ", text)
github robshakir / pyangbind / pyangbind / lib / yangtypes.py View on Github external
if chk_path in extmethods:
            for method in [i for i in dir(extmethods[chk_path]) if not i.startswith("_")]:
                clsslots.append("_" + method)

    class YANGBaseClass(base_type):
        # we only create slots for things that are restricted
        # in adding attributes to them - this means containing
        # data nodes. This means that we can allow
        # leaf._someattr to be used by consuming code - it
        # also fixes an issue whereby we could set __slots__
        # and try and inherit a variable-length inbuilt such
        # as long, which is not allowed.
        if yang_type in ["container", "list"] or is_container == "container":
            __slots__ = tuple(clsslots)

        _pybind_base_class = regex.sub("&lt;(type|class) '(?P.*)'&gt;", "\g", str(base_type))

        def __new__(self, *args, **kwargs):
            try:
                obj = base_type.__new__(self, *args, **kwargs)
            except TypeError:
                obj = base_type.__new__(self)
            return obj

        def __init__(self, *args, **kwargs):
            self._default = False
            self._mchanged = False
            self._yang_name = yang_name
            self._parent = parent_instance
            self._choice = choice_member
            self._path_helper = path_helper
            self._supplied_register_path = supplied_register_path
github standardebooks / tools / se / se_epub.py View on Github external
def set_release_timestamp(self) -&gt; None:
		"""
		If this ebook has not yet been released, set the first release timestamp in the metadata file.

		The ebook is treated as unreleased when the placeholder timestamp
		"1900-01-01T00:00:00Z" is still present in `self.metadata_xhtml`. In that
		case the current UTC time is substituted into the metadata, the metadata
		file is rewritten, `self._metadata_tree` is reset to None, and the
		placeholder date in `src/epub/text/colophon.xhtml` is replaced too.
		Nothing happens when the placeholder is absent.
		"""

		# NOTE(review): this listing contains HTML entities (`&gt;`, `&lt;`,
		# `&nbsp;`, `&quot;`) in the signature and in several string literals
		# below, and some patterns look like they lost their surrounding tags.
		# As written it is not valid Python; it appears to have been damaged
		# during HTML extraction and should be restored from the upstream file.
		if "1900-01-01T00:00:00Z" in self.metadata_xhtml:
			now = datetime.datetime.utcnow()
			# Drop the fractional-seconds suffix of the ISO string and append "Z".
			now_iso = regex.sub(r"\.[0-9]+$", "", now.isoformat()) + "Z"
			# Collapse a trailing "+<offset>Z" into a plain "Z".
			now_iso = regex.sub(r"\+.+?Z$", "Z", now_iso)
			# Human-readable form of the same instant, used for the colophon below.
			now_friendly = "{0:%B %e, %Y, %l:%M&nbsp;<abbr class="\&quot;time">%p</abbr>}".format(now)
			# Collapse whitespace runs and switch AM/PM to "a.m."/"p.m.".
			# NOTE(review): the final .replace(...) call takes three string
			# arguments and looks like two statements fused together — confirm.
			now_friendly = regex.sub(r"\s+", " ", now_friendly).replace("AM", "a.m.").replace("PM", "p.m.").replace(" [^&lt;]+?", "{}".format(now_iso), self.metadata_xhtml)
			self.metadata_xhtml = regex.sub(r"[^&lt;]+?", "{}".format(now_iso), self.metadata_xhtml)

			# Write the updated metadata XHTML back to its file.
			with open(self.metadata_file_path, "w", encoding="utf-8") as file:
				file.seek(0)
				file.write(self.metadata_xhtml)
				file.truncate()

			# Presumably invalidates a parsed copy of the metadata so that it is
			# rebuilt from the new XHTML — TODO confirm against the class.
			self._metadata_tree = None

			# Replace the placeholder release date in the colophon, rewriting the
			# file in place (read, seek to start, write, truncate the remainder).
			with open(self.path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
				xhtml = file.read()
				xhtml = xhtml.replace("<b>January 1, 1900, 12:00&nbsp;<abbr class="\&quot;time">a.m.</abbr></b>", "<b>{}</b>".format(now_friendly))

				file.seek(0)
				file.write(xhtml)
				file.truncate()
github mgrankin / ru_transformers / yt_encoder.py View on Github external
def decode(self, tokens): # I hate regexps
    """Decode *tokens* to text and tidy up newline markers and spacing.

    ``tokens`` is converted with ``.tolist()`` when it is not already a list,
    then passed to ``self.bpe.decode``; the first element of that result is
    post-processed as follows:

    1. ``<|n|>`` markers, with at most one space on either side, become a
       newline.
    2. A space between a newline or ``(`` and a following word character is
       removed.
    3. A space after ``«``, ``"``, a newline or ``(`` is removed when that
       character follows a non-word character and precedes a word character.
    4. ``"word- word"`` is joined into ``"word-word"``.

    Returns:
        The cleaned-up string.
    """
    if not isinstance(tokens, list):
        tokens = tokens.tolist()
    result = self.bpe.decode(tokens)[0]
    # The patterns and templates use literal "<" and ">"; HTML-escaped forms
    # (&lt; / &gt;) would make the \g<N> group references invalid.
    result = re.sub(r'( )?(<\|n\|>)( )?', r'\n', result)
    result = re.sub(r'([\n(]) (\w)', r'\g<1>\g<2>', result)
    # Written as one raw string. The earlier spelling put '' inside the
    # character class, which Python reads as two adjacent literals, so the
    # effective class was already «, ", newline and "(" (no apostrophe).
    result = re.sub(r'(\W)([«"\n(]|^) (\w)', r'\g<1>\g<2>\g<3>', result)
    result = re.sub(r'(\w)- (\w)', r'\g<1>-\g<2>', result)
    return result
github abuccts / wikt2pron / pywiktionary / IPA / fr_pron.py View on Github external
# (2) -ill- after a vowel; repeat if necessary in case of VillVill
    #     sequence (ailloille respelling of ayoye)
    text = sub_repeatedly("(" + vowel_c + ")ill", r"\1j", text)
    # (3) any other ill, except word-initially (illustrer etc.)
    text = re.sub("([^⁀])ill", r"\1ij", text)
    # (4) final -il after a vowel; we consider final -Cil to contain a
    #     pronounced /l/ (e.g. 'il', 'fil', 'avril', 'exil', 'volatil', 'profil')
    text = re.sub("(" + vowel_c + ")il([⁀‿])", r"\1j\2", text)
    # (5) -il- after a vowel, before a consonant (not totally necessary;
    #     unlikely to occur normally, respelling can use -ill-)
    text = re.sub("(" + vowel_c + ")il(" + cons_c + ")", r"\1j\2", text)

    # y; include before removing final -e so we can distinguish -ay from
    # -aye
    text = re.sub("ay([⁀‿])", r"ai\1", text) # Gamay
    text = re.sub("éy", "éj", text) # used in respellings, eqv. to 'éill'
    text = re.sub("(" + vowel_no_i_c + ")y", r"\1iy", text)
    text = re.sub("yi([" + vowel + ".])", r"y.y\1", text)
    text = re.sub("'y‿", "'j‿", text) # il n'y‿a
    text = re.sub("(" + cons_c + ")y(" + cons_c + ")", r"\1i\2", text)
    text = re.sub("(" + cons_c + ")ye?([⁀‿])", r"\1i\2", text)
    text = re.sub("⁀y(" + cons_c + ")", r"⁀i\1", text)
    text = re.sub("⁀y⁀", "⁀i⁀", text)
    text = re.sub("y", "j", text)

    # nasal hacks
    # make 'n' before liaison in certain cases both nasal and pronounced
    text = re.sub("(⁀[mts]?on)‿", r"\1N‿", text) # mon, son, ton, on
    text = re.sub("('on)‿", r"\1N‿", text) # qu'on, l'on
    text = re.sub("([eu]n)‿", r"\1N‿", text) # en, bien, un, chacun etc.
    # in bon, certain etc. the preceding vowel isn't nasal
    text = re.sub("n‿", "N‿", text)
github mbakeranalecta / sam / samparser.py View on Github external
def parse_insert(insert, ref):
    insert_type = None
    ref_type = None
    item = None

    if insert:
        insert_parts = insert.partition(' ')
        insert_type = insert_parts[0]
        item = insert_parts[2].strip()
        # strip unnecessary quotes from insert item
        item = re.sub(r'^(["\'])|(["\'])$', '', item)
        if item == '':
            raise SAMParserStructureError("Insert item not specified in: {0}".format(insert))
    elif ref:
        if ref[0] == '$':
            item = ref[1:]
            ref_type = 'stringref'
        elif ref[0] == '*':
            item = ref[1:]
            ref_type = 'idref'
        elif ref[0] == '#':
            item = ref[1:]
            ref_type = 'nameref'
        elif ref[0] == '%':
            item = ref[1:]
            ref_type = 'keyref'
    else:
github abuccts / wikt2pron / pywiktionary / IPA / ru_pron.py View on Github external
# 2. remaining geminate n after the stress between vowels
            pron = sub_repeatedly("(" + stress_accents + ".*?" + vowels + accents + "?n)ː(" + vowels + ")", r"\1(ː)\2", pron)
            # 3. remaining ž and n between vowels
            pron = sub_repeatedly("(" + vowels + accents + "?[žn])ː(" + vowels + ")", r"\1ˑ\2", pron)
            # 4. ssk (and zsk, already normalized) immediately after the stress
            pron = re.sub("(" + vowels + stress_accents + "[^" + vow + "]*s)ː(k)", r"\1ˑ\2", pron)
            # 5. eliminate remaining gemination, except for ɕː and ӂː
            pron = re.sub("([^ɕӂ\(\)])ː", r"\1", pron)
            # 6. convert special gemination symbol ˑ to regular gemination
            pron = re.sub("ˑ", "ː", pron)

        # handle soft and hard signs, assimilative palatalization
        # 1. insert j before i when required
        pron = re.sub("ʹi", "ʹji", pron)
        # 2. insert glottal stop after hard sign if required
        pron = re.sub("ʺ([aɛiouy])", r"ʔ\1", pron)
        # 3. (ь) indicating optional palatalization
        pron = re.sub("\(ʹ\)", "⁽ʲ⁾", pron)
        # 4. assimilative palatalization of consonants when followed by
        #    front vowels or soft sign
        pron = re.sub("([mnpbtdkgfvszxɣrl])([ː()]*[eiäạëöüʹ])", r"\1ʲ\2", pron)
        pron = re.sub("([cĵ])([ː()]*[äạöüʹ])", r"\1ʲ\2", pron)
        # 5. remove hard and soft signs
        pron = re.sub("[ʹʺ]", "", pron)

        # reduction of unstressed word-final -я, -е; but special-case
        # unstressed не, же. Final -я always becomes [ə]; final -е may
        # become [ə], [e], [ɪ] or [ɨ] depending on the part of speech and
        # the preceding consonants/vowels.
        pron = re.sub("[äạ]⁀", "ə⁀", pron)
        pron = re.sub("⁀nʲe⁀", "⁀nʲi⁀", pron)
        pron = re.sub("⁀že⁀", "⁀žy⁀", pron)