        except:
            chapcount = 1
        # Probe one past the last known chapter to see if newer chapters exist.
        tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
                                        self.story.getMetadata('storyId'),
                                        chapcount+1)
        logger.debug('=Trying newer chapter: %s' % tryurl)
        try:
            newdata = self._fetchUrl(tryurl)
            if "not found. Please check to see you are not using an outdated url." not in newdata \
                    and "This request takes too long to process, it is timed out by the server." not in newdata:
                logger.debug('=======Found newer chapter: %s' % tryurl)
                soup = self.make_soup(newdata)
        except HTTPError as e:
            # 503s are worth surfacing to the caller; everything else is logged.
            if e.code == 503:
                raise e
        except Exception as e:
            logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"^/u/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','https://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)

        ## Pull some additional data from html.
        ## ffnet shows category two ways
        ## 1) class(Book, TV, Game, etc) >> category(Harry Potter, Sailor Moon, etc)
        ## 2) cat1_cat2_Crossover
        ## For 1, use the second link.
        ## For 2, fetch the crossover page and pull the two categories from there.
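## A self-contained sketch of the two-layout logic described above --
## illustrative only, not the adapter's actual code; the 'pre_story_links'
## selector and the fetch() helper are assumptions:
import re
from bs4 import BeautifulSoup

def extract_categories(html, fetch):
    """Return category names from a ffnet story page; fetch(url) downloads."""
    soup = BeautifulSoup(html, 'html5lib')
    crossx = soup.find('a', href=re.compile(r'Crossovers'))
    if crossx is None:
        # Layout 1: class >> category; take the second breadcrumb link.
        links = soup.find('div', id='pre_story_links').find_all('a')
        return [links[1].get_text()]
    # Layout 2: fetch the crossover page and pull both categories from it.
    xsoup = BeautifulSoup(fetch('https://www.fanfiction.net' + crossx['href']),
                          'html5lib')
    return [a.get_text() for a in xsoup.find_all('a', href=re.compile(r'^/crossovers/'))]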
    def get_cache_post(self,postid):
        ## saved using original 'post-99999' id for key.
        postid=unicode(postid) # thank you, Py3.
        if '/posts/' in postid:
            ## allows chapter urls to be passed in directly.
            # assumed normalized to /posts/1234/
            postid = "post-"+postid.split('/')[-2]
        elif '#post-' in postid:
            postid = postid.split('#')[1]
        # logger.debug("get cache %s %s"%(postid,postid in self.post_cache))
        return self.post_cache.get(postid,None)
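## Usage sketch of the three accepted id forms (urls/ids made up for
## illustration); all three normalize to the same 'post-1234' cache key:
# adapter.get_cache_post('post-1234')                                     # bare key
# adapter.get_cache_post('https://forum.example/threads/t.1/posts/1234/') # chapter url
# adapter.get_cache_post('https://forum.example/threads/t.1/#post-1234')  # anchor form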
    def setMetadata(self, key, value, condremoveentities=True):
        # delete cached replace'd value.
        if key in self.processed_metadata_cache:
            del self.processed_metadata_cache[key]
        # Fixing everything downstream to handle bool primitives is a
        # pain.
        if isinstance(value,bool):
            value = unicode(value)
        # keep as list type, but set as only value.
        if self.isList(key):
            self.addToList(key,value,condremoveentities=condremoveentities,clear=True)
        else:
            ## still keeps &lt;, &gt; and &amp;
            if condremoveentities:
                self.metadata[key]=conditionalRemoveEntities(value)
            else:
                self.metadata[key]=value
        if key == "language":
            try:
                # getMetadata not just self.metadata[] to do replace_metadata.
                self.setMetadata('langcode',langs[self.getMetadata(key)])
            except:
                self.setMetadata('langcode','en')
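## Usage sketch (hypothetical values; assumes the langs dict maps language
## names to ISO codes, with 'en' as the fallback for unknown names):
# story.setMetadata('language', 'Deutsch')
# story.getMetadata('langcode')   # -> 'de'
# story.setMetadata('language', 'Klingon')
# story.getMetadata('langcode')   # -> 'en' (fallback path)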
    def make_soup(self,data):
        '''
        Convenience method for getting a bs4 soup. bs3 has been removed.
        '''
        ## html5lib handles <noscript> tags oddly.
        if url.startswith("file:"):
            # only one try for file: urls.
            sleeptimes = [0]
        else:
            sleeptimes = [0, 0.5, 4, 9]
        for sleeptime in sleeptimes:
            time.sleep(sleeptime)
            try:
                (data,opened)=self._fetchUrlRawOpened(url,
                                                      parameters=parameters,
                                                      usecache=usecache,
                                                      extrasleep=extrasleep,
                                                      referer=referer)
                return (self._do_reduce_zalgo(self._decode(data)),opened)
            except HTTPError as he:
                excpt=he
                if he.code in (403,404,410):
                    logger.debug("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
                    break # give up immediately on 403/404/410
            except Exception as e:
                excpt=e
                logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
                raise
        logger.debug("Giving up on %s" %safe_url(url))
        logger.debug(excpt, exc_info=True)
        raise excpt
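## The loop above is a simple backoff: up to four attempts, sleeping 0, 0.5,
## 4 and 9 seconds before each (about 13.5s worst case). The same pattern in
## isolation, simplified to retry every failure where the code above breaks
## out early on 403/404/410 (names here are illustrative, not the library's API):
import time
import logging

logger = logging.getLogger(__name__)

def fetch_with_retries(fetch, url, sleeptimes=(0, 0.5, 4, 9)):
    """Call fetch(url), retrying on failure; re-raise the last exception."""
    excpt = None
    for sleeptime in sleeptimes:
        time.sleep(sleeptime)
        try:
            return fetch(url)
        except Exception as e:
            excpt = e
            logger.debug("Retry of %s after %ss failed: %s", url, sleeptime, e)
    logger.debug("Giving up on %s", url)
    raise excpt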
        if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
            origdata = data
            # data = data.replace(u'<meta name="chaptertitle" content="'+chaptertitle+u'"></meta>',
            #                     u'<meta name="chaptertitle" content="'+chapterorigtitle+u'"></meta>')
            # data = data.replace(u'<title>'+chaptertitle+u'</title>',u'<title>'+chapterorigtitle+u'</title>')
            # data = data.replace(u'<h3>'+chaptertitle+u'</h3>',u'<h3>'+chapterorigtitle+u'</h3>')
            # chaptertitle_tag is the <meta name="chaptertitle"> element found earlier.
            chaptertitle_tag['content'] = chapterorigtitle
            title_tag = soup.find('title')
            if title_tag and title_tag.string == chaptertitle:
                title_tag.string.replace_with(chapterorigtitle)
            h3_tag = soup.find('h3')
            if h3_tag and h3_tag.string == chaptertitle:
                h3_tag.string.replace_with(chapterorigtitle)
            data = unicode(soup)

        entrychanged = ( origdata != data )
        changed = changed or entrychanged
        if entrychanged:
            logger.debug("\nentrychanged:%s\n"%zf)
            _replace_tocncx(tocncxdom,zf,chaptertoctitle)
            ## Also look for and update individual
            ## book toc.ncx files for anthology in case
            ## it's unmerged.
            zf_toc = zf[:zf.rfind('/OEBPS/')]+'/toc.ncx'
            mergedprefix_len = len(zf[:zf.rfind('/OEBPS/')])+1
            if zf_toc in unmerge_tocncxdoms:
                _replace_tocncx(unmerge_tocncxdoms[zf_toc],zf[mergedprefix_len:],chaptertoctitle)
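## The path arithmetic above, worked for one hypothetical merged entry:
# zf = 'OldBook1/OEBPS/file0003.xhtml'
# zf[:zf.rfind('/OEBPS/')]   -> 'OldBook1'
# zf_toc                     -> 'OldBook1/toc.ncx'
# mergedprefix_len           -> len('OldBook1') + 1 == 9
# zf[mergedprefix_len:]      -> 'OEBPS/file0003.xhtml'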
        ENTRY = self.EPUB_LOG_ENTRY
        if self.hasConfig("logpage_update_end"):
            END = string.Template(self.getConfig("logpage_update_end"))
        else:
            END = self.EPUB_LOG_UPDATE_END
        retval = START.substitute(self.story.getAllMetadata())
        ## words_added is only used in logpage because it's the only
        ## place we know the previous version's word count.
        if 'words_added' in (self.getConfigList("logpage_entries") + self.getConfigList("extra_logpage_entries")):
            new_words = self.story.getMetadata('numWords')
            old_words = oldvalues.get('numWords',None)
            if new_words and old_words:
                self.story.setMetadata('words_added',commaGroups(unicode(int(new_words.replace(',',''))-int(old_words.replace(',','')))))
        for entry in self.getConfigList("logpage_entries") + self.getConfigList("extra_logpage_entries"):
            if self.isValidMetaEntry(entry):
                val = self.story.getMetadata(entry)
                if val and ( entry not in oldvalues or val != oldvalues[entry] ):
                    label=self.get_label(entry)
                    # if self.hasConfig(entry+"_label"):
                    #     label=self.getConfig(entry+"_label")
                    # elif entry in self.titleLabels:
                    #     logger.debug("Using fallback label for %s_label"%entry)
                    #     label=self.titleLabels[entry]
                    # else:
                    #     label="%s"%entry.title()
                    #     logger.debug("No known label for %s, fallback to '%s'"%(entry,label))
                    retval = retval + ENTRY.substitute({'id':entry,
                                                        'label':label,
                                                        'value':val})
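## How START/ENTRY/END compose, as a minimal sketch (these Template bodies
## are made up; the real ones come from the EPUB_LOG_* constants or config):
from string import Template
START = Template('<h3>Update - $dateUpdated</h3>')
ENTRY = Template('<br/><b>$label:</b> $value')
END = Template('<hr/>')
page = START.substitute({'dateUpdated': '2024-01-01'})
page += ENTRY.substitute({'id': 'numWords', 'label': 'Words', 'value': '12,345'})
page += END.substitute({})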
def removeEntities(text, space_only=False, remove_all_entities=False):
    # keeps &amp;, &lt; and &gt; when remove_all_entities=False
    if text is None:
        return u""
    if not isinstance(text,basestring):
        text = unicode(text)
    try:
        t = text
    except (UnicodeEncodeError,UnicodeDecodeError) as e:
        try:
            t = text.encode('ascii', 'xmlcharrefreplace')
        except (UnicodeEncodeError,UnicodeDecodeError) as e:
            t = text
    text = t
    # replace numeric versions of [&<>] with named versions,
    # then replace named versions with actual characters,
    text = re.sub(r'&#0*38;','&amp;',text)
    text = re.sub(r'&#0*60;','&lt;',text)
    text = re.sub(r'&#0*62;','&gt;',text)
    # replace remaining &#nnn; entities with unicode value, such as &#8217; -> ’
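## A self-contained sketch of that remaining-numeric-entity pass (the helper
## name is made up; the library's own helper may differ):
import re

try:
    unichr
except NameError:  # Python 3
    unichr = chr

def replace_num_entities(text):
    """Decode &#nnn; and &#xhh; numeric character references to characters."""
    def _one(match):
        num = match.group(1)
        base = 16 if num.lower().startswith('x') else 10
        try:
            return unichr(int(num.lstrip('xX'), base))
        except (ValueError, OverflowError):
            return match.group(0)  # leave malformed references as-is
    return re.sub(r'&#(x?[0-9a-fA-F]+);', _one, text)

# replace_num_entities(u'it&#8217;s') == u'it\u2019s'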