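# The method excerpts below come from FanFicFare-style fanfiction
# scrapers and are shown without their enclosing classes. They assume
# module-level context roughly like this sketch; the exact import paths
# are an assumption and vary by project and version (note the unicode()
# call further down: these excerpts target Python 2 / six-era code).
import re
import logging

from ..six.moves.urllib.error import HTTPError  # assumption: vendored six shim
from .. import exceptions
from ..htmlcleanup import stripHTML

logger = logging.getLogger(__name__)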

def extractChapterUrlsAndMetadata(self):
    # index=1 makes sure we see the story chapter index.  Some
    # sites skip that for one-chapter stories.
    # Except it doesn't this time. :-/
    url = self.url #+'&index=1'+addurl
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

    # Now go hunting for all the meta data and the chapter list.

    # Find authorid and URL from... author url.
    a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
    if a is None:
        raise exceptions.StoryDoesNotExist(self.url)
    self.story.setMetadata('authorId', a['href'].split('=')[1])
    canonlink = soup.find('link', rel='canonical')
    if canonlink:
        # logger.debug(canonlink)
        canonlink = re.sub(r"/chapters/\d+", "", canonlink['href'])
        # logger.debug(canonlink)
        self._setURL(canonlink)
        url = self.url
        data = self._fetchUrl(url)
        soup = self.make_soup(data)
    else:
        # in case title changed
        self._setURL(soup.select_one("div.Story__details a")['href'])
        url = self.url
        try:
            # re-fetch under the (possibly changed) URL; the excerpt's
            # orphaned except clause implies a try block like this one.
            data = self._fetchUrl(url)
            soup = self.make_soup(data)
        except HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e
    self.story.setMetadata('title', stripHTML(soup.find('h1', {'class': 'Story__title'})))

    summhead = soup.find('h5', text='Summary')
    self.setDescription(url, summhead.find_next('div'))

    ## author: first <a> in StoryContents__meta
    autha = soup.find('div', {'class': 'StoryContents__meta'}).find('a')
    self.story.setMetadata('authorId', autha['href'].split('/')[4])
    self.story.setMetadata('authorUrl', autha['href'])
    self.story.setMetadata('author', autha.string)
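    # Worked example (hypothetical URL shape) for the split('/')[4] above:
    #   "https://example.com/author/1234/pen-name".split('/')
    #   -> ['https:', '', 'example.com', 'author', '1234', 'pen-name']
    # so index 4 is the author id, assuming an absolute URL of that shape.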

def doExtractChapterUrlsAndMetadata(self, get_cover=True):
    ## You need to have your is_adult set to True to get this story.
    if not (self.is_adult or self.getConfig("is_adult")):
        raise exceptions.AdultCheckRequired(self.url)

    url = self.url
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist("Code: 404. {0}".format(url))
        elif e.code == 410:
            raise exceptions.StoryDoesNotExist("Code: 410. {0}".format(url))
        elif e.code == 401:
            self.needToLogin = True
            data = ''
        else:
            raise e

    if "The dragons running the back end of the site can not seem to find the story you are looking for." in data:
        raise exceptions.StoryDoesNotExist("{0}.{1} says: The dragons running the back end of the site can not seem to find the story you are looking for.".format(self.zone, self.getBaseDomain()))

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

def extractChapterUrlsAndMetadata(self):
    url = self.url
    logger.debug("URL: "+url)

    ## You need to have your is_adult set to True to get this story.
    if not (self.is_adult or self.getConfig("is_adult")):
        raise exceptions.AdultCheckRequired(self.url)
    else:
        url = url+"?enterAgree=1"

    try:
        data = self._fetchUrl(url)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    soup = self.make_soup(data)

    ## Title
    h1 = soup.find('h1', class_='titleSemantic')
    self.story.setMetadata('title', stripHTML(h1))
    storyInfo = h1.find_next('td', class_='storyInfo')
    storyDescript = h1.find_next('td', class_='storyDescript')

    # Find authorid and URL from... author url.
    a = soup.find('span', string='Author').find_next('a')
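    # Hypothetical continuation (not in the excerpt): pull the author
    # metadata out of the anchor found above, the way the first example
    # in this collection does.
    self.story.setMetadata('authorUrl', a['href'])
    self.story.setMetadata('authorId', a['href'].split('=')[1])
    self.story.setMetadata('author', stripHTML(a))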

def extractChapterUrlsAndMetadata(self):
    url = self.url
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

    title = soup.find('h1')
    for tag in title.findAll('sup'):
        tag.extract()
    self.story.setMetadata('title', stripHTML(title.text))
    logger.debug("Title: (%s)" % self.story.getMetadata('title'))

    # u"Автор: " is "Author: " on this Russian-language site.
    author_title = soup.find('strong', text=re.compile(u"Автор: "))
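    # Hypothetical continuation: on this layout the author link usually
    # follows the "Автор: " label, so a lookup like this would apply.
    a = author_title.find_next('a')
    self.story.setMetadata('author', stripHTML(a))
    self.story.setMetadata('authorUrl', a['href'])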

def doExtractChapterUrlsAndMetadata(self, get_cover=True):
    url = self.url
    try:
        data = self._fetchUrl(url)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist('Error 404: {0}'.format(self.url))
        else:
            raise e

    if 'ERROR: Invalid submission_id or no submission_id requested.' in data:
        raise exceptions.StoryDoesNotExist('{0} says: "ERROR: Invalid submission_id or no submission_id requested." for url "{1}"'.format(self.getSiteDomain(), self.url))

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

    ## To view content, we need to login.
    if 'Submission blocked' in data:
        if self.performLogin(url, soup):  # performLogin raises
            # FailedToLogin if it fails.
            soup = self.make_soup(self._fetchUrl(url, usecache=False))


# The excerpt below is shown without its real enclosing function header;
# the name used here is hypothetical.
def _login_and_fetch_index(self):
    url = self.url
    logger.debug("URL: "+url)

    params = {}
    if self.password:
        params['username'] = self.username
        params['password'] = self.password
    else:
        params['username'] = self.getConfig("username")
        params['password'] = self.getConfig("password")
    if not params['username']:
        raise exceptions.FailedToLogin('You need to have your username and password set.', params['username'])

    try:
        data = self._fetchUrl(url+'index/', params, usecache=False)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist("Code: 404. {0}".format(url))
        elif e.code == 410:
            raise exceptions.StoryDoesNotExist("Code: 410. {0}".format(url))
        elif e.code == 401:
            self.needToLogin = True
            data = ''
        else:
            raise e

    if "The requested file has not been found" in data:
        raise exceptions.StoryDoesNotExist(
            "{0}.{1} says: The requested file has not been found".format(
                self.zone, self.getBaseDomain()))

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

def get_page(self, page):
    '''
    Download the given URL and return the page data. Several pages are
    fetched below, so centralizing the fetch and its error handling here
    keeps the rest of the file shorter.
    '''
    try:
        page_data = self._fetchUrl(page)
    except HTTPError as e:
        if e.code == 404:
            raise exceptions.StoryDoesNotExist('404 error: {}'.format(page))
        else:
            raise e
    return page_data
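    # Hypothetical usage from elsewhere in the same adapter; get_page()
    # converts a 404 into StoryDoesNotExist so call sites stay short:
    #   soup = self.make_soup(self.get_page(self.url))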

def extractChapterUrlsAndMetadata(self):
    url = self.url
    logger.debug("URL: "+url)

    try:
        data = self._fetchUrl(url) # w/o trailing / gets 'chapter list' page even for one-shots.
    except HTTPError as e:
        if e.code == 404:
            logger.error("404 on %s" % url)
            raise exceptions.StoryDoesNotExist(self.url)
        else:
            raise e

    # use BeautifulSoup HTML parser to make everything easier to find.
    soup = self.make_soup(data)

    ## title:
    ## <h1 id="post-title">A, A' Fan Fiction ❯ Mmmmm</h1>
    titletext = unicode(stripHTML(soup.find("h1", {"id": "post-title"})))
    # keep only the story title, which follows the '❯' separator
    titletext = titletext[titletext.index(u'❯')+2:]
    self.story.setMetadata('title', titletext)
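    # Worked example of the slice above, using the sample <h1> from the
    # comment: u"A, A' Fan Fiction ❯ Mmmmm".index(u'❯') + 2 skips the
    # marker and its trailing space, leaving u"Mmmmm" as the title.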

def _customized_fetch_url(self, url, exception=None, parameters=None):
    if exception:
        try:
            data = self._fetchUrl(url, parameters)
        except HTTPError:
            raise exception(self.url)
    else:
        # Just let self._fetchUrl throw the exception; don't catch and
        # customize it.
        data = self._fetchUrl(url, parameters)
    return self.make_soup(data)
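    # Hypothetical call sites: pass an exception class to map any
    # HTTPError onto a site-specific error, or omit it to let the
    # HTTPError propagate unchanged:
    #   soup = self._customized_fetch_url(self.url, exceptions.StoryDoesNotExist)
    #   soup = self._customized_fetch_url(self.url)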