How to use the regex.DOTALL flag in regex

To help you get started, we’ve selected a few regex.DOTALL examples, drawn from popular ways it is used in public projects.

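A quick note on what the flag does: regex.DOTALL (the regex module's counterpart of re.DOTALL) makes the "." metacharacter match any character, including newlines; without it, "." stops at line breaks. A minimal sketch:

import regex

text = "first\nsecond"
print(regex.search(r"first.second", text))                # None: "." will not cross "\n"
print(regex.search(r"first.second", text, regex.DOTALL))  # matches across the newline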

From facelessuser/Rummage, tests/test_rumcore.py (view on GitHub):
self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.WORD |
        rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.FULLCASE | rc.POSIX
    ).flags,
    regex.V0 | regex.ASCII | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
self.assertEqual(
    rc._regex_pattern(
        r"test",
        rc.UNICODE | rc.DOTALL | rc.IGNORECASE | rc.MULTILINE | rc.FULLCASE |
        rc.WORD | rc.BESTMATCH | rc.ENHANCEMATCH | rc.REVERSE | rc.VERSION1 | rc.POSIX
    ).flags,
    regex.V1 | regex.UNICODE | regex.DOTALL | regex.IGNORECASE | regex.MULTILINE |
    regex.WORD | regex.ENHANCEMATCH | regex.BESTMATCH | regex.REVERSE | regex.FULLCASE |
    regex.POSIX
)
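
The test above verifies that Rummage's rc._regex_pattern wrapper translates its own flag constants into the regex module's, by comparing the compiled pattern's .flags attribute with the expected combination. Note that .flags also reports flags the module adds implicitly; a quick sketch:

import regex

p = regex.compile(r"a.b", regex.DOTALL)
print(bool(p.flags & regex.DOTALL))   # True: the flag we passed
print(bool(p.flags & regex.UNICODE))  # True: the default for str patterns
print(bool(p.flags & regex.V0))       # True under the module's default V0 version
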
From aamini/introtodeeplearning_labs, lab1/util.py (view on GitHub):
# Note: `re` here is the third-party regex module, not the standard library;
# plain re.findall has no overlapped argument.
def extract_song_snippet(generated_text):
    pattern = '\n\n(.*?)\n\n'
    search_results = re.findall(pattern, generated_text, overlapped=True, flags=re.DOTALL)
    songs = [song for song in search_results]
    print("Found {} possible songs in generated texts".format(len(songs)))
    return songs
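
Here DOTALL lets the non-greedy (.*?) span the line breaks inside a multi-line song, while the regex module's overlapped=True lets adjacent songs share a "\n\n" delimiter. A small sketch of the same idea:

import regex

text = "\n\nline one\nline two\n\nchorus\n\n"
# Without DOTALL, (.*?) could not cross the single "\n" inside the first snippet.
print(regex.findall(r"\n\n(.*?)\n\n", text, overlapped=True, flags=regex.DOTALL))
# -> ['line one\nline two', 'chorus']
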
From berkmancenter/mediacloud-sentence-splitter, sentence_splitter/__init__.py (view on GitHub):
                "Non-breaking prefix file for language '{}' was not found at path '{}'".format(
                    language,
                    non_breaking_prefix_file,
                ))

        self.__non_breaking_prefixes = dict()
        with open(non_breaking_prefix_file, mode='r', encoding='utf-8') as prefix_file:
            for line in prefix_file.readlines():

                if '#NUMERIC_ONLY#' in line:
                    prefix_type = SentenceSplitter.PrefixType.NUMERIC_ONLY
                else:
                    prefix_type = SentenceSplitter.PrefixType.DEFAULT

                # Remove comments
                line = regex.sub(pattern=r'#.*', repl='', string=line, flags=regex.DOTALL | regex.UNICODE)

                line = line.strip()

                if not line:
                    continue

                self.__non_breaking_prefixes[line] = prefix_type
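
The sub() call strips trailing comments from each prefix line; with DOTALL, the "#.*" also swallows the line's trailing newline, since "." is allowed to match it. For example:

import regex

line = "Dr  # abbreviation, not a sentence end\n"
cleaned = regex.sub(pattern=r"#.*", repl="", string=line,
                    flags=regex.DOTALL | regex.UNICODE)
print(repr(cleaned.strip()))  # 'Dr'
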
From lziad/WhitePhosphorus-bot, src/cs1language.py (view on GitHub):
import regex
from . import botsite
from .core import check, EditQueue
from .botsite import cur_timestamp, get_summary


# This module is called once an hour.
MAX_WORK_PER_HOUR = 50
LAST_SORT_KEY = None

tar_template = '[Cc]ite |[Cc]itation'
tar_para = 'language'
# NOTE: HTML extraction stripped the original group names from this pattern;
# <nest> is recovered from the (?&nest) calls, the other names are placeholders.
para_re = regex.compile(r'(?P<head>{{\s*(%s)(?:(?!{{|}}).)*?(?P<nest>{{(?:(?!{{).)*?(?&nest)?(?:(?!}}).)*?}})*'
                        r'(?:(?!{{|}}).)*?\|\s*(%s)\s*=\s*)(?P<value>.*?)'
                        r'(?P<tail>\s*(\|\s*(?:(?!{{|}}).)*(?&nest)*(?:(?!{{|}}).)*?)?}})' %
                        (tar_template, tar_para), regex.DOTALL)

sub_dict = {
    r'阿拉伯[语語文]|Arabic': 'ar',
    r'保加利亚[语文]|保加利亞[語文]|Bulgarian': 'bg',
    r'波斯尼亚[语文]|波士尼亚[語文]|Bosnian': 'bs',
    r'加泰罗尼亚[语文]|加泰羅尼亞[語文]|Catalan': 'ca',
    r'捷克[语語文]|Czech': 'cs',
    r'丹麦[语文]|丹麥[語文]|Danish': 'da',
    r'德[语語文]|Germany?|Deutsch|de-DE': 'de',
    r'希腊[语文]|希臘[語文]|Greek': 'el',
    r'英[语語文]|English|en-(UK|IN)|\[\[English language(\|English)?\]\]': 'en',
    r'西班牙[语語文]|Spanish|español|\[\[西班牙語(\|Spanish)?\]\]': 'es',
    r'爱沙尼亚[语文]|愛沙尼亞[語文]|Estonian': 'et',
    r'波斯[语語文]|Persian': 'fa',
    r'芬兰[语文]|芬蘭[語文]|Finnish': 'fi',
    r'法[语語文]|French|Français|fr-FR|\[\[French language(\|French)?\]\]|\{\{fr icon\}\}': 'fr',
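
para_re above finds a citation template and captures its language parameter, using a named group plus (?&nest) recursive calls so that balanced nested templates are skipped rather than mistaken for the end of the outer one; DOTALL keeps this working when a template spans several lines. A stripped-down sketch of the same trick (a hypothetical pattern, not the bot's):

import regex

nested = regex.compile(r"(?P<tpl>{{(?:(?!{{|}}).|(?&tpl))*}})", regex.DOTALL)
wikitext = "{{cite web\n|language={{ISO|en}}\n}}"
print(nested.findall(wikitext))  # the whole outer template, nested braces included
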
From fnl/segtok, segtok/segmenter.py (view on GitHub):
|   from
|   has
|   i(?: nto|s )
|   o[fr]
|   t(?: han|hat|hrough )
|   via
|   w(?: as|ere|hether|ith )
)\b""", UNICODE | VERBOSE)
"Lower-case words that in the given form usually don't start a sentence."

BEFORE_LOWER = compile(r""" .*?
(?: [%s]"[\)\]]*           # ."]) .") ."
|   [%s] [\)\]]+           # .]) .)
|   \b spp \.              # spp.  (species pluralis)
|   \b \p{L} \p{Ll}? \.    # Ll. L.
) \s+ $""" % (SENTENCE_TERMINALS, SENTENCE_TERMINALS), DOTALL | UNICODE | VERBOSE
)
"""
Endings that, if followed by a lower-case word, are not sentence terminals:
- Quotations and brackets ("Hello!" said the man.)
- dotted abbreviations (U.S.A. was)
- genus-species-like (m. musculus)
"""
LOWER_WORD = compile(r'^\p{Ll}+[%s]?\p{Ll}*\b' % HYPHENS, UNICODE)
"Lower-case words are not sentence starters (after an abbreviation)."

MIDDLE_INITIAL_END = compile(r'\b\p{Lu}\p{Ll}+\W+\p{Lu}$', UNICODE)
"Upper-case initial after upper-case word at the end of a string."

UPPER_WORD_START = compile(r'^\p{Lu}\p{Ll}+\b', UNICODE)
"Upper-case word at the beginning of a string."
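
In BEFORE_LOWER, DOTALL is what lets the leading .*? consume line breaks, since the candidate sentence boundary may sit several lines into the buffer; VERBOSE permits the commented multi-line layout. A simplified sketch (a hypothetical pattern, not segtok's):

import regex

pat = regex.compile(r".*? \b \p{L} \p{Ll}? \. \s+ $",
                    regex.DOTALL | regex.UNICODE | regex.VERBOSE)
print(bool(pat.match("first line\nsee M. ")))  # True; False without DOTALL
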
From skoczen/will, will/backends/generation/fuzzy_all_matches.py (view on GitHub):
        if not hasattr(self, "cached_regex"):
            self.cached_regex = {}

        method_path = method_meta["plugin_info"]["parent_path"]
        if method_path not in self.cached_regex:

            regex_string = method_meta["regex_pattern"]
            if "case_sensitive" in method_meta and not method_meta["case_sensitive"]:
                regex_string = "(?i)%s" % regex_string

            if method_meta["multiline"]:
                try:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex_string,
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.MULTILINE | regex.DOTALL | regex.ENHANCEMATCH)
                except:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex.escape(regex_string),
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.MULTILINE | regex.DOTALL | regex.ENHANCEMATCH)
            else:
                try:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex_string,
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.ENHANCEMATCH)
                except:
                    self.cached_regex[method_path] = regex.compile("%s{e<=%s}" % (
                        regex.escape(regex_string),
                        settings.FUZZY_REGEX_ALLOWABLE_ERRORS
                    ), regex.ENHANCEMATCH)
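
Will compiles each trigger pattern as a fuzzy regex: the {e<=N} suffix tolerates up to N errors (insertions, deletions or substitutions), ENHANCEMATCH asks the engine to improve the fit of a fuzzy match rather than settle for the first one it finds, and MULTILINE | DOTALL keep this working for multi-line messages; the except branches retry with the pattern escaped in case it is not valid regex syntax. A minimal sketch of the fuzzy part:

import regex

p = regex.compile(r"(?:deploy the app){e<=2}",
                  regex.MULTILINE | regex.DOTALL | regex.ENHANCEMATCH)
print(bool(p.search("please deploi the app now")))  # True: one substitution
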
From materialsintelligence/mat2vec, mat2vec/processing/process.py (view on GitHub):
                   "Scm−1", "Acm−1", "eV−1cm−2", "cm-2", "sccm", "cm−2eV−1", "cm−3eV−1",
                   "kA", "s−1", "emu", "L", "cmHz1", "gmol−1", "kVcm−1", "MPam1",
                   "cm2V−1s−1", "Acm−2", "cm−2s−1", "MV", "ionscm−2", "Jcm−2", "ncm−2",
                   "Jcm−2", "Wcm−2", "GWcm−2", "Acm−2K−2", "gcm−3", "cm3g−1", "mgl−1",
                   "mgml−1", "mgcm−2", "mΩcm", "cm−2s−1", "cm−2", "ions", "moll−1",
                   "nmol", "psi", "mol·L−1", "Jkg−1K−1", "km", "Wm−2", "mass", "mmHg",
                   "mmmin−1", "GeV", "m−2", "m−2s−1", "Kmin−1", "gL−1", "ng", "hr", "w",
                   "mN", "kN", "Mrad", "rad", "arcsec", "Ag−1", "dpa", "cdm−2",
                   "cd", "mcd", "mHz", "m−3", "ppm", "phr", "mL", "ML", "mlmin−1", "MWm−2",
                   "Wm−1K−1", "Wm−1K−1", "kWh", "Wkg−1", "Jm−3", "m-3", "gl−1", "A−1",
                   "Ks−1", "mgdm−3", "mms−1", "ks", "appm", "ºC", "HV", "kDa", "Da", "kG",
                   "kGy", "MGy", "Gy", "mGy", "Gbps", "μB", "μL", "μF", "nF", "pF", "mF",
                   "A", "Å", "A˚", "μgL−1"]

    NR_BASIC = regex.compile(r"^[+-]?\d*\.?\d+\(?\d*\)?+$", regex.DOTALL)
    NR_AND_UNIT = regex.compile(r"^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}|Ω|μ]+.*)", regex.DOTALL)

    PUNCT = list(string.punctuation) + ["\"", "“", "”", "≥", "≤", "×"]

    def __init__(self, phraser_path=PHRASER_PATH):
        self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
        self.phraser = Phraser.load(phraser_path)

    def tokenize(self, text, split_oxidation=True, keep_sentences=True):
        """Converts a string to a list of tokens (words) using a modified chemdataextractor tokenizer.

        Adds a few fixes for inorganic materials science, such as splitting common units from numbers
        and splitting the valence state.

        Args:
            text: input text as a string
            split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
From microsoft/Recognizers-Text, Python/libraries/recognizers-number/recognizers_number/number/extractors.py (view on GitHub):
    def generate_regexes(self, ignore_case: bool = False) -> List[Pattern]:
        definitions = self.get_definitions()
        options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
        return list(map(lambda d: RegExpUtility.get_safe_reg_exp(d, options),
                        definitions))
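
Because the flag constants are plain integers combined with bitwise OR, 0 works as a no-op, which is what makes the conditional above read cleanly. The same idiom in isolation:

import regex

ignore_case = True
options = regex.DOTALL | (regex.IGNORECASE if ignore_case else 0)
print(bool(regex.search(r"a.b", "A\nB", options)))  # True: "." crosses "\n", case is ignored
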
From microsoft/presidio, presidio-analyzer/analyzer/predefined_recognizers/iban_recognizer.py (view on GitHub):
    def __is_valid_format(iban):
        country_code = iban[:2]
        if country_code in regex_per_country:
            country_regex = regex_per_country[country_code]
            return country_regex and re.match(country_regex, iban,
                                              flags=re.DOTALL | re.MULTILINE)

        return False
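
Here the per-country pattern is applied with the standard library's re module; DOTALL and MULTILINE look defensive, since the country patterns are anchored, single-line expressions anyway. A sketch with a simplified, made-up UK pattern (not presidio's actual table):

import re

country_regex = r"^GB\d{2}[A-Z]{4}\d{14}$"
print(bool(re.match(country_regex, "GB29NWBK60161331926819",
                    flags=re.DOTALL | re.MULTILINE)))  # True
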
From materialsintelligence/matscholar, matscholar/process.py (view on GitHub):
                   'MΩ', 'Ω', 'kΩ', 'mΩ', 'mgL−1', 'moldm−3', 'm2', 'm3', 'cm-1', 'cm',
                   'Scm−1', 'Acm−1', 'eV−1cm−2', 'cm-2', 'sccm', 'cm−2eV−1', 'cm−3eV−1',
                   'kA', 's−1', 'emu', 'L', 'cmHz1', 'gmol−1', 'kVcm−1', 'MPam1',
                   'cm2V−1s−1', 'Acm−2', 'cm−2s−1', 'MV', 'ionscm−2', 'Jcm−2', 'ncm−2',
                   'Jcm−2', 'Wcm−2', 'GWcm−2', 'Acm−2K−2', 'gcm−3', 'cm3g−1', 'mgl−1',
                   'mgml−1', 'mgcm−2', 'mΩcm', 'cm−2s−1', 'cm−2', 'ions', 'moll−1',
                   'nmol', 'psi', 'mol·L−1', 'Jkg−1K−1', 'km', 'Wm−2', 'mass', 'mmHg',
                   'mmmin−1', 'GeV', 'm−2', 'm−2s−1', 'Kmin−1', 'gL−1', 'ng', 'hr', 'w',
                   'mN', 'kN', 'Mrad', 'rad', 'arcsec', 'Ag−1', 'dpa', 'cdm−2',
                   'cd', 'mcd', 'mHz', 'm−3', 'ppm', 'phr', 'mL', 'ML', 'mlmin−1', 'MWm−2',
                   'Wm−1K−1', 'Wm−1K−1', 'kWh', 'Wkg−1', 'Jm−3', 'm-3', 'gl−1', 'A−1',
                   'Ks−1', 'mgdm−3', 'mms−1', 'ks', 'appm', 'ºC', 'HV', 'kDa', 'Da', 'kG',
                   'kGy', 'MGy', 'Gy', 'mGy', 'Gbps', 'μB', 'μL', 'μF', 'nF', 'pF', 'mF',
                   'A', 'Å', 'A˚', "μgL−1"]

    NR_BASIC = regex.compile(r'^[+-]?\d*\.?\d+\(?\d*\)?+$', regex.DOTALL)
    NR_AND_UNIT = regex.compile(r'^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}|Ω|μ]+.*)', regex.DOTALL)

    PUNCT = list(string.punctuation) + ['"', '“', '”', '≥', '≤', '×']

    def __init__(self, phraser_path=PHRASER_PATH):
        self.elem_name_dict = {en: es for en, es in zip(self.ELEMENT_NAMES, self.ELEMENTS)}
        self.phraser = Phraser.load(phraser_path)

    def tokenize(self, text, split_oxidation=True, keep_sentences=True):
        """
        Converts a string to a list of tokens (words) using the chemdataextractor tokenizer, with a couple of fixes
        for inorganic materials science.
        Keeps the structure of sentences.
        :param text: input text as a string
        :param split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
        will become iron (II), same with Fe(II), etc.
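
NR_AND_UNIT, defined identically here and in the mat2vec snippet above, is what splits tokens like "1.5eV" into a number and a unit: the first group captures the numeric part (optionally with a bracketed uncertainty), and the second must start with a Latin-script, Ω or μ character so the split only happens at a plausible unit. A usage sketch:

import regex

NR_AND_UNIT = regex.compile(r'^([+-]?\d*\.?\d+\(?\d*\)?+)([\p{script=Latin}|Ω|μ]+.*)',
                            regex.DOTALL)
m = NR_AND_UNIT.match('1.5eV')
print(m.groups() if m else None)  # ('1.5', 'eV')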