Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def encode(self, text):
    """Encode ``text`` into a flat list of BPE vocabulary entries.

    ``text`` is split into chunks with ``re.findall(self.pat, text)``. Each
    chunk is UTF-8 encoded, every byte is mapped to a string through
    ``self.byte_encoder`` and the results are joined. ``self.bpe`` turns
    that string into space-separated pieces, and each piece is looked up
    in ``self.encoder``.

    Args:
        text: The string to encode.

    Returns:
        list: The ``self.encoder`` values for every piece, in input order.

    Raises:
        KeyError: If a byte is missing from ``self.byte_encoder`` or a
            piece is missing from ``self.encoder``.
    """
    bpe_tokens = []
    for token in re.findall(self.pat, text):
        # Re-express the chunk byte by byte so that the merge step works on
        # the byte-level alphabet defined by self.byte_encoder.
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
        # self.bpe returns the merged pieces separated by single spaces.
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
    return bpe_tokens
# Check for stage direction that ends in ?! but also has a trailing period
matches = regex.findall(r"<i>(?:(?![,:;!?]", file_contents)
if matches:
messages.append(LintMessage("Stage direction ending in period next to other punctuation. Remove trailing periods in stage direction.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for ending punctuation inside italics
matches = regex.findall(r"(<([ib]) epub:type=\"se:[^\"]+?\">[^<]+?[\.,\!\?])", file_contents)
if matches:
messages.append(LintMessage("Ending punctuation inside italics.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[0], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for money not separated by commas
matches = regex.findall(r"[£\$][0-9]{4,}", file_contents)
if matches:
messages.append(LintMessage("Numbers not grouped by commas. Separate numbers greater than 1,000 with commas at every three numerals.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for trailing commas inside <i> tags at the close of dialog
if ",</i>”" in file_contents:
messages.append(LintMessage("Comma inside <i> tag before closing dialog. (Search for ,</i>”)", se.MESSAGE_TYPE_WARNING, filename))
# Check for period following Roman numeral, which is an old-timey style we must fix
# But ignore the numeral if it's the first item in a <p> tag, as that suggests it might be a kind of list item.
matches = regex.findall(r"(?]*?>)<span>[^<]+?</span>\.\s+[a-z]", file_contents)
if matches:
messages.append(LintMessage("Roman numeral followed by a period. When in mid-sentence Roman numerals must not be followed by a period.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))</p></i>
def y_close(self):
    """Parse the first decimal number in ``newlist[4]`` and round it.

    The value at index 4 of ``newlist`` (a name defined outside this
    function) is searched for the first run of digits, a literal ``.``,
    and one to four more digits. That text is converted to ``float`` and
    rounded to three decimal places.

    Returns:
        float: The parsed number rounded to three decimal places.

    Raises:
        IndexError: If ``newlist`` has fewer than five items, or if
            ``newlist[4]`` contains no decimal number of that form.
    """
    yesterday_close = newlist[4]
    # Raw string: "\d" must reach the regex engine rather than be treated as
    # an (invalid) string escape. The dot is escaped so that only a real
    # decimal point separates the digit groups; an unescaped "." would also
    # accept text such as "12x34", which float() cannot parse.
    first_number = regex.findall(r"\d+\.\d{1,4}", yesterday_close)[0]
    return round(float(first_number), 3)
matches = regex.findall(r"<([a-z0-9]+)[^>]*?>\s*(<span>[^<]+?</span>)\s*", file_contents, flags=regex.DOTALL)
if matches:
messages.append(LintMessage("If <span> exists only for the z3998:roman semantic, then z3998:roman should be pulled into parent tag instead.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[1], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for "Hathi Trust" instead of "HathiTrust"
if "Hathi Trust" in file_contents:
messages.append(LintMessage("\"Hathi Trust\" should be \"HathiTrust\"", se.MESSAGE_TYPE_ERROR, filename))
# Check for uppercase letters in IDs or classes
matches ="[id],[class]")
for match in matches:
if match.has_attr("id"):
normalized_id = unicodedata.normalize("NFKD", match["id"])
uppercase_matches = regex.findall(r"[A-Z]", normalized_id)
for _ in uppercase_matches:
messages.append(LintMessage("Uppercase ID attribute: {}. Attribute values must be all lowercase.".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))
number_matches = regex.findall(r"^[0-9]", normalized_id)
for _ in number_matches:
messages.append(LintMessage("ID starting with a number is illegal XHTML: {}".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))
if match.has_attr("class"):
for css_class in match["class"]:
uppercase_matches = regex.findall(r"[A-Z]", unicodedata.normalize("NFKD", css_class))
for _ in uppercase_matches:
messages.append(LintMessage("Uppercase class attribute: {}. Attribute values must be all lowercase.".format(css_class), se.MESSAGE_TYPE_ERROR, filename))
matches = [x for x in"section") if not x.has_attr("id")]
if matches:
messages.append(LintMessage("<section> element without id attribute.", se.MESSAGE_TYPE_ERROR, filename))</section></span>
le_except = ["whole", "mobile", "pole", "male", "female", "hale", "pale", "tale", "sale", "aisle", "whale", "while"]
if word[-1:] == "e":
if word[-2:] == "le" and word not in le_except:
disc += 1
# 4) check if consecutive vowels exist (triplets or pairs) and count them as one.
double_and_triple = len(regex.findall(r"[eaoui][eaoui]", word))
tripple = len(regex.findall(r"[eaoui][eaoui][eaoui]", word))
disc += double_and_triple + tripple
# 5) count remaining vowels in word.
num_vowels = len(regex.findall(r"[eaoui]", word))
# 6) add one if starts with "mc"
if word[:2] == "mc":
syls += 1
# 7) add one if the word ends with "y" and that "y" is not preceded by a vowel
if word[-1:] == "y" and word[-2] not in "aeoui":
syls += 1
# 8) add one if "y" is surrounded by non-vowels and is not the first or last letter of the word.
for i, j in enumerate(word):
if j == "y":
if (i != 0) and (i != len(word) - 1): # pylint: disable=consider-using-in
if word[i - 1] not in "aeoui" and word[i + 1] not in "aeoui":
syls += 1
def get_content_files(self) -> list:
    """
    Reads the spine from content.opf to obtain a list of content files, in the order wanted for the ToC.
    It assumes this has already been manually ordered by the producer.

    Returns:
        list of content files in the order given in the spine in content.opf
    """
    # NOTE(review): the pattern below is empty in this copy of the code. It
    # looks like the markup-matching pattern was lost during extraction. As
    # written, findall() yields only empty strings. Restore the intended
    # spine pattern before relying on the result.
    return regex.findall(r"", self.metadata_xhtml)
# Remove HTML tags
xhtml = regex.sub(r"<title>.+?</title>", " ", xhtml)
xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)
# Replace some formatting characters
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Remove word-connecting dashes, apostrophes, commas, periods, and slashes (and/or); they count as a word boundary but they shouldn't
xhtml = regex.sub(r"[a-z0-9][\-\'\,\.\/][a-z0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Replace sequential spaces with one space
xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Get the word count
return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
For speed, don't import until here.
global re
if re is None:
import regex as re
if tokenization_regex is None:
# by default, use the big regex.
global bigregex
if bigregex==None:
bigregex = wordRegex()
tokenization_regex = bigregex
#return bigregex
self.tokens = re.findall(tokenization_regex,self.string)
return self.tokens
def extract_dsa_private(self):
    """Extract DSA private key

    Stores in ``self.state`` the list of every occurrence of the
    ``-----BEGIN DSA PRIVATE KEY-----`` marker found in the string returned
    by ``self._convert_to_str()``. The pattern matches the header marker
    only, so the list is empty when the marker is absent.

    Returns:
        Chepy: The Chepy object.
    """
    self.state = re.findall(
        r"-----BEGIN DSA PRIVATE KEY-----", self._convert_to_str()
    )
    return self
messages.append(LintMessage("Long description must be escaped HTML.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HTML entities in long-description, but allow &amp;
if"&[a-z]+?;", metadata_xhtml.replace("&amp;", "")):
messages.append(LintMessage("HTML entites detected in metadata. Use Unicode equivalents instead.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal em-dashes in dc:subject
if"[^<]+?—[^<]+?", metadata_xhtml) is not None:
messages.append(LintMessage("Illegal em-dash detected in dc:subject; use --", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for empty production notes
if "Any special notes about the production of this ebook for future editors/producers? Remove this element if not." in metadata_xhtml:
messages.append(LintMessage("Empty production-notes element in metadata.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal VCS URLs
matches = regex.findall(r"([^<]+?)", metadata_xhtml)
if matches:
for match in matches:
if not match.startswith(""):
messages.append(LintMessage("Illegal se:url.vcs.github. VCS URLs must begin with {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HathiTrust scan URLs instead of actual record URLs
if "" in metadata_xhtml or "" in metadata_xhtml:
messages.append(LintMessage("Use HathiTrust record URLs, not page scan URLs, in metadata, imprint, and colophon. Record URLs look like:", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal se:subject tags
matches = regex.findall(r"([^<]+?)", metadata_xhtml)
if matches:
for match in matches:
if match not in se.SE_GENRES:
messages.append(LintMessage("Illegal se:subject: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))