How to use the regex.findall function in regex

To help you get started, we’ve selected a few regex.findall examples based on popular ways the function is used in public projects.

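If you're new to the third-party regex package, the call works like re.findall: it returns every non-overlapping match as a list of strings, or as a list of tuples once the pattern has two or more capture groups. A minimal, self-contained example with made-up input:

import regex

# made-up input
text = "Order #42 shipped on 2019-07-04 for $1250"
print(regex.findall(r"\d+", text))              # ['42', '2019', '07', '04', '1250']
print(regex.findall(r"(\d{4})-(\d{2})", text))  # [('2019', '07')]  (tuples once there are 2+ groups)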

github minimaxir / gpt-2-simple / gpt_2_simple / src / encoder.py
def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens
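The re here is the third-party regex module (encoder.py imports regex as re), since the pattern uses Unicode property classes like \p{L} that stdlib re does not support. A rough sketch of the pre-tokenization step; the pattern below is the commonly cited GPT-2 one, so treat it as an assumption rather than a verbatim copy from this file:

import regex as re  # stdlib re does not support \p{L} / \p{N}

# commonly cited GPT-2 pre-tokenization pattern (assumed, not copied from this repo)
pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

print(re.findall(pat, "Hello, world! It's 2024."))
# ['Hello', ',', ' world', '!', ' It', "'s", ' 2024', '.']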
github standardebooks / tools / se / se_epub_lint.py
# Check for stage direction that ends in ?! but also has a trailing period
					matches = regex.findall(r"<i>(?:(?![,:;!?]", file_contents)
					if matches:
						messages.append(LintMessage("Stage direction ending in period next to other punctuation. Remove trailing periods in stage direction.", se.MESSAGE_TYPE_WARNING, filename))
						for match in matches:
							messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))

					# Check for ending punctuation inside italics
					matches = regex.findall(r"(<([ib]) epub:type=\"se:[^\"]+?\">[^<]+?[\.,\!\?])", file_contents)
					if matches:
						messages.append(LintMessage("Ending punctuation inside italics.", se.MESSAGE_TYPE_WARNING, filename))
						for match in matches:
							messages.append(LintMessage(match[0], se.MESSAGE_TYPE_WARNING, filename, True))

					# Check for money not separated by commas
					matches = regex.findall(r"[£\$][0-9]{4,}", file_contents)
					if matches:
						messages.append(LintMessage("Numbers not grouped by commas. Separate numbers greater than 1,000 with commas at every three numerals.", se.MESSAGE_TYPE_WARNING, filename))
						for match in matches:
							messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))

					# Check for trailing commas inside <i> tags at the close of dialog
					if ",</i>”" in file_contents:
						messages.append(LintMessage("Comma inside <i> tag before closing dialog. (Search for ,</i>”)", se.MESSAGE_TYPE_WARNING, filename))

					# Check for period following Roman numeral, which is an old-timey style we must fix
					# But ignore the numeral if it's the first item in a <p> tag, as that suggests it might be a kind of list item.
					matches = regex.findall(r"(?<!<p[^>]*?>)<span epub:type=\"z3998:roman\">[^<]+?</span>\.\s+[a-z]", file_contents)
					if matches:
						messages.append(LintMessage("Roman numeral followed by a period. When in mid-sentence Roman numerals must not be followed by a period.", se.MESSAGE_TYPE_WARNING, filename))
						for match in matches:
							messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
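Note why the loop over the second check indexes match[0]: once a pattern has two or more capture groups, regex.findall returns a tuple per match rather than a plain string. A tiny illustration with an invented fragment:

import regex

# invented XHTML fragment
fragment = '<i epub:type="se:name.publication.book">Middlemarch,</i>'
matches = regex.findall(r"(<([ib]) epub:type=\"se:[^\"]+?\">[^<]+?[\.,\!\?])", fragment)
print(matches)        # [('<i epub:type="se:name.publication.book">Middlemarch,', 'i')]
print(matches[0][0])  # group 1: the fragment the lint message reports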
github nhsb1 / pivot-point-calculator / ppc.py
def y_close(self):
	yesterday_close = newlist[4]
	yesterday_close_float = round(float(regex.findall("\d+.\d{1,4}", yesterday_close)[0]),3)
	return yesterday_close_float
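A sketch of what that call pulls out of the quote string (the sample line below is invented; escaping the dot as \. in a raw string also keeps it from matching an arbitrary character between the digit groups):

import regex

yesterday_close = "AAPL 425.6700 +1.25%"  # invented quote line
price = round(float(regex.findall(r"\d+\.\d{1,4}", yesterday_close)[0]), 3)
print(price)  # 425.67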
github standardebooks / tools / se / se_epub_lint.py
matches = regex.findall(r"<([a-z0-9]+)[^>]*?>\s*(<span epub:type=\"z3998:roman\">[^<]+?</span>)\s*</\1>", file_contents, flags=regex.DOTALL)
					if matches:
						messages.append(LintMessage("If <span> exists only for the z3998:roman semantic, then z3998:roman should be pulled into parent tag instead.", se.MESSAGE_TYPE_WARNING, filename))
						for match in matches:
							messages.append(LintMessage(match[1], se.MESSAGE_TYPE_WARNING, filename, True))

					# Check for "Hathi Trust" instead of "HathiTrust"
					if "Hathi Trust" in file_contents:
						messages.append(LintMessage("\"Hathi Trust\" should be \"HathiTrust\"", se.MESSAGE_TYPE_ERROR, filename))

					# Check for uppercase letters in IDs or classes
					matches = dom.select("[id],[class]")
					for match in matches:
						if match.has_attr("id"):
							normalized_id = unicodedata.normalize("NFKD", match["id"])
							uppercase_matches = regex.findall(r"[A-Z]", normalized_id)
							for _ in uppercase_matches:
								messages.append(LintMessage("Uppercase ID attribute: {}. Attribute values must be all lowercase.".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))

							number_matches = regex.findall(r"^[0-9]", normalized_id)
							for _ in number_matches:
								messages.append(LintMessage("ID starting with a number is illegal XHTML: {}".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))

						if match.has_attr("class"):
							for css_class in match["class"]:
								uppercase_matches = regex.findall(r"[A-Z]", unicodedata.normalize("NFKD", css_class))
								for _ in uppercase_matches:
									messages.append(LintMessage("Uppercase class attribute: {}. Attribute values must be all lowercase.".format(css_class), se.MESSAGE_TYPE_ERROR, filename))

					matches = [x for x in dom.select("section") if not x.has_attr("id")]
					if matches:
						messages.append(LintMessage("<section> element without id attribute.", se.MESSAGE_TYPE_ERROR, filename))
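The character-level checks above rely on findall returning one element per match, which is what lets the loops emit one message per offending character. With invented attribute values:

import regex
import unicodedata

# invented id/class values
normalized_id = unicodedata.normalize("NFKD", "Chapter-1")
print(regex.findall(r"[A-Z]", normalized_id))  # ['C']: one list entry per uppercase letter
print(regex.findall(r"^[0-9]", "3-epigraph"))  # ['3']: anchored, so at most one entry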
github standardebooks / tools / se / formatting.py
le_except = ["whole", "mobile", "pole", "male", "female", "hale", "pale", "tale", "sale", "aisle", "whale", "while"]

	if word[-1:] == "e":
		if word[-2:] == "le" and word not in le_except:
			pass

		else:
			disc += 1

	# 4) check if consecutive vowels exist, triplets or pairs, and count them as one.
	double_and_triple = len(regex.findall(r"[eaoui][eaoui]", word))
	tripple = len(regex.findall(r"[eaoui][eaoui][eaoui]", word))
	disc += double_and_triple + tripple

	# 5) count remaining vowels in word.
	num_vowels = len(regex.findall(r"[eaoui]", word))

	# 6) add one if starts with "mc"
	if word[:2] == "mc":
		syls += 1

	# 7) add one if ends with "y" but is not surrounded by a vowel
	if word[-1:] == "y" and word[-2] not in "aeoui":
		syls += 1

	# 8) add one if "y" is surrounded by non-vowels and is not in the last word.
	for i, j in enumerate(word):
		if j == "y":
			if (i != 0) and (i != len(word) - 1): # pylint: disable=consider-using-in
				if word[i - 1] not in "aeoui" and word[i + 1] not in "aeoui":
					syls += 1
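To see how steps 4 and 5 interact, here is the vowel-group counting on a made-up word; findall returns non-overlapping matches, scanning left to right:

import regex

word = "beautiful"  # made-up test word
print(len(regex.findall(r"[eaoui]", word)))                # 5 single vowels
print(len(regex.findall(r"[eaoui][eaoui]", word)))         # 1 non-overlapping pair ("ea")
print(len(regex.findall(r"[eaoui][eaoui][eaoui]", word)))  # 1 triplet ("eau")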
github standardebooks / tools / se / se_epub.py
def get_content_files(self) -> list:
		"""
		Reads the spine from content.opf to obtain a list of content files, in the order wanted for the ToC.
		It assumes this has already been manually ordered by the producer.

		INPUTS:
		None

		OUTPUTS:
		list of content files in the order given in the spine in content.opf
		"""

		return regex.findall(r"<itemref idref=\"(.*?)\"/>", self.metadata_xhtml)
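With a stripped-down, invented spine snippet, the single capture group means findall returns just the idref values, in spine order:

import regex

# invented spine snippet from a content.opf
metadata_xhtml = '<itemref idref="chapter-1.xhtml"/><itemref idref="chapter-2.xhtml"/>'
print(regex.findall(r"<itemref idref=\"(.*?)\"/>", metadata_xhtml))
# ['chapter-1.xhtml', 'chapter-2.xhtml']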
github standardebooks / tools / se / formatting.py
# Remove HTML tags
	xhtml = regex.sub(r"<title>.+?</title>", " ", xhtml)
	xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)

	# Replace some formatting characters
	xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)

	# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or); they count as a word boundary but shouldn't
	xhtml = regex.sub(r"[a-z0-9][\-\'\,\.\/][a-z0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)

	# Replace sequential spaces with one space
	xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)

	# Get the word count
	return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
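The word count is simply the number of \b\w+\b matches left after the substitutions, for example on an invented sentence:

import regex

xhtml = "It was a dark and stormy night."  # invented sentence
print(len(regex.findall(r"\b\w+\b", xhtml)))  # 7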
github Bookworm-project / BookwormDB / bookwormDB / tokenizer.py
except:
            """
            For speed, don't import until here.
            """
            tokenization_regex=self.tokenization_regex
            global re
            if re is None:
                import regex as re
            if tokenization_regex is None:
                # by default, use the big regex.
                global bigregex
                if bigregex==None:
                    bigregex = wordRegex()
                tokenization_regex = bigregex
            #return bigregex
            self.tokens = re.findall(tokenization_regex,self.string)
            return self.tokens
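wordRegex() builds the project's own tokenization pattern and isn't reproduced here; the simpler stand-in below only shows the shape of the call, with findall returning the token list directly:

import regex as re

# stand-in pattern, not the project's wordRegex()
tokenization_regex = r"\w+|[^\w\s]"
print(re.findall(tokenization_regex, "Don't panic!"))
# ['Don', "'", 't', 'panic', '!']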
github securisec / chepy / chepy / modules / extractors.py
def extract_dsa_private(self):
        """Extract DSA private key

        Returns:
            Chepy: The Chepy object. 
        """
        self.state = re.findall(
            r"-----BEGIN DSA PRIVATE KEY-----", self._convert_to_str()
        )
        return self
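As written, the pattern matches only the PEM header line itself, so the resulting state is a list of headers rather than full keys. A quick check with a dummy blob:

import re

blob = "noise -----BEGIN DSA PRIVATE KEY----- noise"  # dummy input
print(re.findall(r"-----BEGIN DSA PRIVATE KEY-----", blob))
# ['-----BEGIN DSA PRIVATE KEY-----']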
github standardebooks / tools / se / se_epub_lint.py
messages.append(LintMessage("Long description must be escaped HTML.", se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for HTML entities in long-description, but allow &amp;amp;
	if regex.search(r"&amp;[a-z]+?;", metadata_xhtml.replace("&amp;amp;", "")):
		messages.append(LintMessage("HTML entities detected in metadata. Use Unicode equivalents instead.", se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for illegal em-dashes in <dc:subject>
	if regex.search(r"<dc:subject[^>]*?>[^<]+?—[^<]+?</dc:subject>", metadata_xhtml) is not None:
		messages.append(LintMessage("Illegal em-dash detected in dc:subject; use --", se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for empty production notes
	if "Any special notes about the production of this ebook for future editors/producers? Remove this element if not." in metadata_xhtml:
		messages.append(LintMessage("Empty production-notes element in metadata.", se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for illegal VCS URLs
	matches = regex.findall(r"<meta property=\"se:url.vcs.github\">([^<]+?)</meta>", metadata_xhtml)
	if matches:
		for match in matches:
			if not match.startswith("https://github.com/standardebooks/"):
				messages.append(LintMessage("Illegal se:url.vcs.github. VCS URLs must begin with https://github.com/standardebooks/: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for HathiTrust scan URLs instead of actual record URLs
	if "babel.hathitrust.org" in metadata_xhtml or "hdl.handle.net" in metadata_xhtml:
		messages.append(LintMessage("Use HathiTrust record URLs, not page scan URLs, in metadata, imprint, and colophon. Record URLs look like: https://catalog.hathitrust.org/Record/", se.MESSAGE_TYPE_ERROR, "content.opf"))

	# Check for illegal se:subject tags
	matches = regex.findall(r"<meta property=\"se:subject\">([^<]+?)</meta>", metadata_xhtml)
	if matches:
		for match in matches:
			if match not in se.SE_GENRES:
				messages.append(LintMessage("Illegal se:subject: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
	else:
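Both metadata checks above use a single capture group, so findall hands back just the element text to compare against the se.SE_GENRES list. For example, with an invented content.opf fragment:

import regex

# invented content.opf fragment
metadata_xhtml = '<meta property="se:subject">Fiction</meta><meta property="se:subject">Satire</meta>'
print(regex.findall(r"<meta property=\"se:subject\">([^<]+?)</meta>", metadata_xhtml))
# ['Fiction', 'Satire']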