Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def encode(self, text):
    """Encode ``text`` into a flat list of BPE token ids.

    The text is split into chunks with ``self.pat``. Each chunk is UTF-8
    encoded, every byte is mapped through ``self.byte_encoder``, and the
    joined result is handed to ``self.bpe``. The space-separated pieces
    that ``self.bpe`` returns are then looked up in ``self.encoder``.

    Args:
        text: The string to encode.

    Returns:
        A list holding one ``self.encoder`` value per BPE piece, in the
        order the pieces appear in ``text``.
    """
    ids = []
    for chunk in re.findall(self.pat, text):
        # Re-express the chunk's raw UTF-8 bytes in the byte_encoder
        # alphabet before running BPE on it.
        mapped = ''.join([self.byte_encoder[byte] for byte in chunk.encode('utf-8')])
        for piece in self.bpe(mapped).split(' '):
            ids.append(self.encoder[piece])
    return ids
# Check for stage direction that ends in ?! but also has a trailing period
matches = regex.findall(r"<i>(?:(?![,:;!?]", file_contents)
if matches:
messages.append(LintMessage("Stage direction ending in period next to other punctuation. Remove trailing periods in stage direction.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for ending punctuation inside italics
matches = regex.findall(r"(<([ib]) epub:type=\"se:[^\"]+?\">[^<]+?[\.,\!\?])", file_contents)
if matches:
messages.append(LintMessage("Ending punctuation inside italics.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[0], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for money not separated by commas
matches = regex.findall(r"[£\$][0-9]{4,}", file_contents)
if matches:
messages.append(LintMessage("Numbers not grouped by commas. Separate numbers greater than 1,000 with commas at every three numerals.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))
# Check for trailing commas inside <i> tags at the close of dialog
if ",</i>”" in file_contents:
messages.append(LintMessage("Comma inside <i> tag before closing dialog. (Search for ,</i>”)", se.MESSAGE_TYPE_WARNING, filename))
# Check for period following Roman numeral, which is an old-timey style we must fix
# But ignore the numeral if it's the first item in a <p> tag, as that suggests it might be a kind of list item.
matches = regex.findall(r"(?]*?>)<span>[^<]+?</span>\.\s+[a-z]", file_contents)
if matches:
messages.append(LintMessage("Roman numeral followed by a period. When in mid-sentence Roman numerals must not be followed by a period.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match, se.MESSAGE_TYPE_WARNING, filename, True))</p></i>
def y_close(self):
    """Return the first number-like match in ``newlist[4]`` as a float.

    ``newlist`` is not defined in this block; presumably it is a sequence
    of strings populated elsewhere — verify against the caller.

    Returns:
        The first regex match converted to ``float`` and rounded to three
        decimal places.

    Raises:
        IndexError: If the pattern finds no match in ``newlist[4]``.
        ValueError: If the matched text cannot be parsed by ``float``.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about. The pattern itself is unchanged.
    # NOTE(review): the "." is unescaped, so it matches any character (not
    # only a decimal point) — confirm whether r"\d+\.\d{1,4}" was intended.
    first_number = regex.findall(r"\d+.\d{1,4}", newlist[4])[0]
    return round(float(first_number), 3)
matches = regex.findall(r"<([a-z0-9]+)[^>]*?>\s*(<span>[^<]+?</span>)\s*", file_contents, flags=regex.DOTALL)
if matches:
messages.append(LintMessage("If <span> exists only for the z3998:roman semantic, then z3998:roman should be pulled into parent tag instead.", se.MESSAGE_TYPE_WARNING, filename))
for match in matches:
messages.append(LintMessage(match[1], se.MESSAGE_TYPE_WARNING, filename, True))
# Check for "Hathi Trust" instead of "HathiTrust"
if "Hathi Trust" in file_contents:
messages.append(LintMessage("\"Hathi Trust\" should be \"HathiTrust\"", se.MESSAGE_TYPE_ERROR, filename))
# Check for uppercase letters in IDs or classes
matches = dom.select("[id],[class]")
for match in matches:
if match.has_attr("id"):
normalized_id = unicodedata.normalize("NFKD", match["id"])
uppercase_matches = regex.findall(r"[A-Z]", normalized_id)
for _ in uppercase_matches:
messages.append(LintMessage("Uppercase ID attribute: {}. Attribute values must be all lowercase.".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))
number_matches = regex.findall(r"^[0-9]", normalized_id)
for _ in number_matches:
messages.append(LintMessage("ID starting with a number is illegal XHTML: {}".format(match["id"]), se.MESSAGE_TYPE_ERROR, filename))
if match.has_attr("class"):
for css_class in match["class"]:
uppercase_matches = regex.findall(r"[A-Z]", unicodedata.normalize("NFKD", css_class))
for _ in uppercase_matches:
messages.append(LintMessage("Uppercase class attribute: {}. Attribute values must be all lowercase.".format(css_class), se.MESSAGE_TYPE_ERROR, filename))
matches = [x for x in dom.select("section") if not x.has_attr("id")]
if matches:
messages.append(LintMessage("<section> element without id attribute.", se.MESSAGE_TYPE_ERROR, filename))</section></span>
le_except = ["whole", "mobile", "pole", "male", "female", "hale", "pale", "tale", "sale", "aisle", "whale", "while"]
if word[-1:] == "e":
if word[-2:] == "le" and word not in le_except:
pass
else:
disc += 1
# 4) check if consecutive vowels exist (triplets or pairs); count them as one.
double_and_triple = len(regex.findall(r"[eaoui][eaoui]", word))
tripple = len(regex.findall(r"[eaoui][eaoui][eaoui]", word))
disc += double_and_triple + tripple
# 5) count remaining vowels in word.
num_vowels = len(regex.findall(r"[eaoui]", word))
# 6) add one if starts with "mc"
if word[:2] == "mc":
syls += 1
# 7) add one if ends with "y" but the "y" is not preceded by a vowel
if word[-1:] == "y" and word[-2] not in "aeoui":
syls += 1
# 8) add one if "y" is surrounded by non-vowels and is not the first or last letter of the word.
for i, j in enumerate(word):
if j == "y":
if (i != 0) and (i != len(word) - 1): # pylint: disable=consider-using-in
if word[i - 1] not in "aeoui" and word[i + 1] not in "aeoui":
syls += 1
def get_content_files(self) -> list:
    """
    Return the content files named in the spine of content.opf, in spine order.

    The spine order is what the ToC is built from, so it is assumed that the
    producer has already arranged it manually.

    NOTE(review): the pattern below is an empty string as it stands, so the
    call presumably yields only empty matches rather than file names. The
    spine-item pattern appears to be missing — confirm against upstream.

    INPUTS:
    None

    OUTPUTS:
    list of the pattern's matches in self.metadata_xhtml
    """
    spine_pattern = r""
    return regex.findall(spine_pattern, self.metadata_xhtml)
# Remove HTML tags
xhtml = regex.sub(r"<title>.+?</title>", " ", xhtml)
xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)
# Replace some formatting characters
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Remove word-connecting dashes, apostrophes, commas, periods, and slashes (and/or); they count as a word boundary but they shouldn't
xhtml = regex.sub(r"[a-z0-9][\-\'\,\.\/][a-z0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Replace sequential spaces with one space
xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
# Get the word count
return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
except:
"""
For speed, don't import until here.
"""
tokenization_regex=self.tokenization_regex
global re
if re is None:
import regex as re
if tokenization_regex is None:
# by default, use the big regex.
global bigregex
if bigregex==None:
bigregex = wordRegex()
tokenization_regex = bigregex
#return bigregex
self.tokens = re.findall(tokenization_regex,self.string)
return self.tokens
def extract_dsa_private(self):
    """Find DSA private key header markers in the current state.

    Every occurrence of the literal ``-----BEGIN DSA PRIVATE KEY-----``
    line marker in the string form of the state is collected, and the
    resulting list of matches replaces ``self.state``.

    Returns:
        Chepy: The Chepy object.
    """
    marker = r"-----BEGIN DSA PRIVATE KEY-----"
    text = self._convert_to_str()
    self.state = re.findall(marker, text)
    return self
messages.append(LintMessage("Long description must be escaped HTML.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HTML entities in long-description, but allow &amp;
if regex.search(r"&[a-z]+?;", metadata_xhtml.replace("&amp;", "")):
messages.append(LintMessage("HTML entites detected in metadata. Use Unicode equivalents instead.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal em-dashes in dc:subject
if regex.search(r"[^<]+?—[^<]+?", metadata_xhtml) is not None:
messages.append(LintMessage("Illegal em-dash detected in dc:subject; use --", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for empty production notes
if "Any special notes about the production of this ebook for future editors/producers? Remove this element if not." in metadata_xhtml:
messages.append(LintMessage("Empty production-notes element in metadata.", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal VCS URLs
matches = regex.findall(r"([^<]+?)", metadata_xhtml)
if matches:
for match in matches:
if not match.startswith("https://github.com/standardebooks/"):
messages.append(LintMessage("Illegal se:url.vcs.github. VCS URLs must begin with https://github.com/standardebooks/: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for HathiTrust scan URLs instead of actual record URLs
if "babel.hathitrust.org" in metadata_xhtml or "hdl.handle.net" in metadata_xhtml:
messages.append(LintMessage("Use HathiTrust record URLs, not page scan URLs, in metadata, imprint, and colophon. Record URLs look like: https://catalog.hathitrust.org/Record/", se.MESSAGE_TYPE_ERROR, "content.opf"))
# Check for illegal se:subject tags
matches = regex.findall(r"([^<]+?)", metadata_xhtml)
if matches:
for match in matches:
if match not in se.SE_GENRES:
messages.append(LintMessage("Illegal se:subject: {}".format(match), se.MESSAGE_TYPE_ERROR, "content.opf"))
else: