def add_item(self, guid):
    # Look up the stored item; bail out if it is unknown.
    try:
        item = Item.get(guid=guid)
    except Item.DoesNotExist:
        return False
    if len(item.html):
        # Strip the markup and index the plain text of the page.
        soup = BeautifulSoup(item.html, self.settings.fetcher.parser)
        plaintext = ''.join(soup.find_all(text=True))
        self.writer.add_document(
            id=item.id,
            guid=unicode(item.guid),
            title=item.title,
            text=plaintext,
            when=datetime.datetime.utcfromtimestamp(item.when)
        )
        return True
    return False

# Collect internal links (/item/ pages) keyed by their anchor text.
inter_links_dict = {}
soup = BeautifulSoup(response.text, 'lxml')
inter_links = soup.find_all('a', href=re.compile(r"/item/"))
for link in inter_links:
    new_url = link["href"]
    url_name = link.get_text()
    new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
    inter_links_dict[url_name] = new_full_url
try:
    item['interLink'] = json.dumps(inter_links_dict)
except Exception:
    item['interLink'] = None

# Collect external links (/redirect/ pages) the same way.
exter_links_dict = {}
soup = BeautifulSoup(response.text, 'lxml')
exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/"))
for link in exterLink_links:
    new_url = link["href"]
    url_name = link.get_text()
    new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
    exter_links_dict[url_name] = new_full_url
try:
    item['exterLink'] = json.dumps(exter_links_dict)
except Exception:
    item['exterLink'] = None

# Concatenate the text of every <div class="para"> paragraph on the page.
all_para = soup.find_all('div', class_="para")
all_text = [para.get_text() for para in all_para]
try:
    item['all_text'] = ' '.join(all_text)
except Exception:
    item['all_text'] = None  # fall back to None, matching the link fields above

def soup(self):
    """Download the page and create the soup."""
    try:
        # Return the cached soup if it has already been built.
        return self._soup
    except AttributeError:
        url = self.client.get_url("/presentations/%s" % self.index)
        content = self.client.fetch_no_cache(url).decode('utf-8')
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
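The try/except AttributeError above is a lazy-caching idiom: the expensive fetch and parse run only the first time the attribute is read, and every later access returns the stored soup. A minimal, self-contained sketch of the same pattern (the Page class and URL here are made up for illustration):

import bs4
import requests

class Page:
    def __init__(self, url):
        self.url = url

    @property
    def soup(self):
        try:
            return self._soup  # already parsed: reuse the cached tree
        except AttributeError:
            # First access: download the page and build the soup once.
            html = requests.get(self.url).text
            self._soup = bs4.BeautifulSoup(html, "html.parser")
            return self._soup

# page = Page("https://example.com")
# page.soup is page.soup  # True: the second access hits the cache
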
def crawl(url):  # hypothetical name and signature; the original excerpt starts inside a function body
    try:
        # Cache pages under the SHA-256 hash of their URL; skip URLs already fetched.
        save_name = 'htmls/' + hashlib.sha256(bytes(url, 'utf8')).hexdigest()
        save_href = 'hrefs/' + hashlib.sha256(bytes(url, 'utf8')).hexdigest()
        if os.path.exists(save_name):
            return []
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers)
        except Exception:
            return []
        r.encoding = 'UTF-8'  # r.apparent_encoding
        html = r.text
        try:
            # Store the page gzip-compressed under its hash.
            with open(save_name, 'wb') as f:
                f.write(gzip.compress(bytes(html, 'utf8')))
        except OSError:
            return []
        soup = bs4.BeautifulSoup(html, 'html.parser')
        hrefs = []
        for href in soup.find_all('a', href=True):
            _url = href['href']
            try:
                if _url[0] == '/':
                    _url = URL + _url  # make site-relative links absolute
            except IndexError:
                continue
            if re.search(r'^' + URL, _url) is None:
                continue  # keep only links on the same site
            # _url = re.sub(r'\?.*?$', '', _url)
            hrefs.append(_url)
        with open(save_href, 'w') as f:
            f.write(json.dumps(hrefs))
        # Return only the links that have not been fetched and cached yet.
        return [href for href in hrefs
                if not os.path.exists('htmls/' + hashlib.sha256(bytes(href, 'utf8')).hexdigest())]
    except Exception as ex:
        return []  # assumed fallback; the handler body is not part of the excerpt

from bs4 import BeautifulSoup
html = '''
<ul>
<li> 100 </li>
<li> 200 </li>
</ul>
<ol>
<li> 300 </li>
<li> 400 </li>
</ol>
'''
soup = BeautifulSoup(html, 'html5lib')
result = soup.select('ul li')  # descendant selector: matches only the <li> items inside the <ul>, not those in the <ol>
print(result)
def get_plaintext(self):
    """Returns text content as plain text."""
    soup = BeautifulSoup(self.get_html(), 'html5lib')
    return soup.get_text()
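get_text() concatenates every text node in the parsed tree, which is what makes it useful for plain-text extraction. A quick standalone illustration (the sample markup below is made up):

from bs4 import BeautifulSoup

html = "<article><h1>Title</h1><p>First <b>bold</b> paragraph.</p></article>"
soup = BeautifulSoup(html, 'html.parser')

# get_text() drops all tags and keeps only the text nodes.
print(soup.get_text())                  # TitleFirst bold paragraph.
print(soup.get_text(' ', strip=True))   # Title First bold paragraph.
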
# step 2: crawl user's personal information
req = self.__get_request(info_url)
soup = BeautifulSoup(req.text, 'lxml')
self.__parse_info_list(soup)

# step 3: crawl user's fans list
if self.max_num_fans > 0 and self.data['num_fans'] > 0:
    req = self.__get_request(fans_url)
    soup = BeautifulSoup(req.text, 'lxml')
    fans_max_page = self.__parse_max_pages(soup)
    self.data['fans'] += self.__parse_fans(soup)
    for i in xrange(2, 1 + fans_max_page):
        if i > self.max_num_page or len(self.data['fans']) >= self.max_num_fans:
            break
        req = self.__get_request(fans_url + '?page=' + str(i))
        soup = BeautifulSoup(req.text, 'lxml')
        self.data['fans'] += self.__parse_fans(soup)
    self.data['fans'] = self.data['fans'][:self.max_num_fans]

# step 4: crawl user's follow list
if self.max_num_follow > 0 and self.data['num_follow'] > 0:
    req = self.__get_request(follow_url)
    soup = BeautifulSoup(req.text, 'lxml')
    follow_max_page = self.__parse_max_pages(soup)
    self.data['follow'] += self.__parse_follow(soup)
    for i in xrange(2, 1 + follow_max_page):
        if i > self.max_num_page or len(self.data['follow']) >= self.max_num_follow:
            break
        req = self.__get_request(follow_url + '?page=' + str(i))
        soup = BeautifulSoup(req.text, 'lxml')
        self.data['follow'] += self.__parse_follow(soup)
    self.data['follow'] = self.data['follow'][:self.max_num_follow]

from requests import get  # assumed source of `get`; the original excerpt omits its imports
from bs4 import BeautifulSoup

def makesoup(url):
    """Fetch a URL and return its parsed soup."""
    html = get(url).text
    soup = BeautifulSoup(html, 'html5lib')
    return soup
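Used like any other helper; for example (the URL below is a placeholder):

soup = makesoup('https://example.com')
print(soup.title.string if soup.title else 'no <title> found')
for a in soup.find_all('a', href=True):  # list every link on the page
    print(a['href'])
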
def getAnswer(answerID):
    host = "http://www.zhihu.com"
    url = host + answerID
    print url
    # Spoof a browser User-Agent in the request header.
    user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    header = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=header)
    resp = urllib2.urlopen(req)
    # The page source has been fetched; now extract the content we want.
    # BeautifulSoup makes this straightforward.
    bs = BeautifulSoup(resp, "html.parser")
    title = bs.title
    # The retrieved page title.
    filename_old = title.string.strip()
    print filename_old.encode("utf-8")
    # File name used to save the content; strip characters that are not
    # allowed in file names with a regular expression.
    filename = re.sub(r'[\/:*?"<>|]', '-', filename_old)
    save2file(filename, title.string)
    title_content = title.string
    answer = []
    detail = bs.find("div", class_="zm-editable-content")
    user_ids = bs.find_all("a", class_="author-link")

Parameter {bs4.Tag} post_summary: the bs4.Tag post summary.
Returns {bs4.BeautifulSoup}: the BeautifulSoup of the post,
    if it has an accepted answer; otherwise, None.
'''
if has_accepted_answer(post_summary):
    post_url = get_post_url(post_summary)
    try:
        response = requests.get(BASE_URL + post_url)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        return None
    return BeautifulSoup(response.text, 'lxml')
return None