How to use the bs4.BeautifulSoup function in bs4

To help you get started, we’ve selected a few examples showing how bs4.BeautifulSoup is commonly used in public projects.

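All of the snippets below follow the same basic pattern: load some HTML (from requests, urllib, a file, or a database field), hand it to bs4.BeautifulSoup together with a parser name, and then query the resulting tree with find_all, select, or get_text. As a quick orientation before the project code, here is a minimal, self-contained sketch of that pattern; the HTML string and tag names in it are made up for illustration.

from bs4 import BeautifulSoup

# Illustrative markup only; in the project examples below it comes from
# requests, urllib2, or a model field.
html = '<div class="para"><a href="/item/python">Python</a> is widely used.</div>'

# 'html.parser' ships with Python; 'lxml' and 'html5lib' are optional third-party parsers.
soup = BeautifulSoup(html, 'html.parser')

for link in soup.find_all('a', href=True):    # every <a> tag that has an href attribute
    print(link['href'], link.get_text())      # -> /item/python Python

print(soup.get_text())                        # the document's text with all markup stripped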

github rcarmo / bottle-fever / controllers / index.py View on Github external
def add_item(self, guid):
    try:
        item = Item.get(guid = guid)
    except Item.DoesNotExist:
        return False

    if len(item.html):
        soup = BeautifulSoup(item.html, self.settings.fetcher.parser)
        # Concatenate every text node in the document into one plain-text string.
        plaintext = ''.join(soup.find_all(text=True))
        self.writer.add_document(
            id    = item.id,
            guid  = unicode(item.guid),  # Python 2 built-in; use str() on Python 3
            title = item.title,
            text  = plaintext,
            when  = datetime.datetime.utcfromtimestamp(item.when)
        )
        return True
    return False
github Pelhans / Z_knowledge_graph / ie / craw / craw_all_baidu / baidu_baike / spiders / baidu_baike-5.py View on Github external
        inter_links_dict = {}
        soup = BeautifulSoup(response.text, 'lxml')
        inter_links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in inter_links:
            new_url = link["href"]
            url_name = link.get_text()
            new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
            inter_links_dict[url_name] = new_full_url
        try:
            item['interLink'] = json.dumps(inter_links_dict)
        except:
            item['interLink'] = None
        
        exter_links_dict = {}
        soup = BeautifulSoup(response.text, 'lxml')
        exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/"))
        for link in exterLink_links:
            new_url = link["href"]
            url_name = link.get_text()
            new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
            exter_links_dict[url_name] = new_full_url
        try:
            item['exterLink'] = json.dumps(exter_links_dict)
        except:
            item['exterLink'] = None

        all_para = soup.find_all('div',class_="para")
        all_text = [para.get_text() for para in all_para]
        try:
            item['all_text'] = ' '.join(all_text)
        except:
            item['all_text'] = None
github cykl / infoqscraper / infoqscraper / scrap.py View on Github external
def soup(self):
    """Download the page and create the soup"""
    try:
        return self._soup
    except AttributeError:
        # Not parsed yet: fetch the page once and cache the resulting soup.
        url = client.get_url("/presentations/%s" % self.index)
        content = self.client.fetch_no_cache(url).decode('utf-8')
        self._soup = bs4.BeautifulSoup(content, "html.parser")

        return self._soup
github GINK03 / scraping-designs / doujin-eromanga-com / scrape.py View on Github external
    save_name = 'htmls/' + hashlib.sha256(bytes(url,'utf8')).hexdigest()
    save_href = 'hrefs/' + hashlib.sha256(bytes(url,'utf8')).hexdigest()
    if os.path.exists(save_name) is True:
      return []
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    try:
      r = requests.get(url, headers=headers)
    except Exception as e:
      return []
    r.encoding = 'UTF-8'#r.apparent_encoding
    html = r.text
    try:
      open(save_name, 'wb').write( gzip.compress(bytes(html,'utf8')) )
    except OSError:
      return []
    soup = bs4.BeautifulSoup(html, 'html.parser')  # name the parser explicitly so bs4 does not have to guess
   
    hrefs = []
    for href in soup.find_all('a', href=True): 
      _url = href['href']
      try:
        if '/' == _url[0]:
          _url = URL + _url
      except IndexError as e:
        continue
      if re.search(r'^' + URL, _url) is None: 
        continue
      #_url = re.sub(r'\?.*?$', '', _url)
      hrefs.append(_url)
    open(save_href, 'w').write( json.dumps(hrefs) )
    return [href for href in hrefs if os.path.exists('htmls/' + hashlib.sha256(bytes(href,'utf8')).hexdigest()) == False] 
  except Exception as ex:
    return []
github pystockhub / book / ch19 / day06 / 03.py View on Github external
from bs4 import BeautifulSoup

html = '''
<ul>
	<li> 100 </li>
	<li> 200 </li>
</ul>
<ol>
	<li> 300 </li>
	<li> 400 </li>
</ol>
'''
soup = BeautifulSoup(html, 'html5lib')    
result = soup.select('ul li')
print(result)
github universitas / universitas.no / django / apps / stories / models / mixins.py View on Github external
def get_plaintext(self):
    """ Returns text content as plain text. """
    soup = BeautifulSoup(self.get_html(), 'html5lib')
    return soup.get_text()
github intfloat / sina-weibo-crawler / wcrawler.py View on Github external
        # step 2: crawl user's personal information
        req = self.__get_request(info_url)
        soup = BeautifulSoup(req.text, 'lxml')
        self.__parse_info_list(soup)

        # step 3: crawl user's fans list
        if self.max_num_fans > 0 and self.data['num_fans'] > 0:
            req = self.__get_request(fans_url)
            soup = BeautifulSoup(req.text, 'lxml')
            fans_max_page = self.__parse_max_pages(soup)
            self.data['fans'] += self.__parse_fans(soup)
            for i in xrange(2, 1 + fans_max_page):
                if i > self.max_num_page or len(self.data['fans']) >= self.max_num_fans:
                    break
                req = self.__get_request(fans_url + '?page=' + str(i))
                soup = BeautifulSoup(req.text, 'lxml')
                self.data['fans'] += self.__parse_fans(soup)
            self.data['fans'] = self.data['fans'][:self.max_num_fans]

        # step 4: crawl user's follow list
        if self.max_num_follow > 0 and self.data['num_follow'] > 0:
            req = self.__get_request(follow_url)
            soup = BeautifulSoup(req.text, 'lxml')
            follow_max_page = self.__parse_max_pages(soup)
            self.data['follow'] += self.__parse_follow(soup)
            for i in xrange(2, 1 + follow_max_page):
                if i > self.max_num_page or len(self.data['follow']) >= self.max_num_follow:
                    break
                req = self.__get_request(follow_url + '?page=' + str(i))
                soup = BeautifulSoup(req.text, 'lxml')
                self.data['follow'] += self.__parse_follow(soup)
            self.data['follow'] = self.data['follow'][:self.max_num_follow]
github CLARIAH / COW / src / old / nappvocab_converter / nappvocab.py View on Github external
def makesoup(url):
    html = get(url).text
    soup = BeautifulSoup(html, 'html5lib')
    return soup
github palexu / send2kindle / zhihu2kindle.py View on Github external
def getAnswer(answerID):
    host="http://www.zhihu.com"
    url=host+answerID
    print url  # Python 2-style print; this whole snippet targets Python 2 (urllib2)
    user_agent="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    # build a request header to disguise ourselves as a browser
    header={"User-Agent":user_agent}
    req=urllib2.Request(url,headers=header)
    resp=urllib2.urlopen(req)

    # we now have the page's HTML; next, extract the content we want.
    # BeautifulSoup makes this very convenient
    bs=BeautifulSoup(resp,"html.parser")
    title=bs.title
    # the extracted title

    filename_old=title.string.strip()
    print filename_old.encode("utf-8")
    filename = re.sub('[\/:*?"<>|]','-',filename_old)
    # filename used to save the content; filenames cannot contain certain
    # special characters, so strip them out with a regular expression

    save2file(filename,title.string)
    title_content=title.string

    answer=[]

    detail=bs.find("div",class_="zm-editable-content")
    user_ids=bs.find_all("a",class_="author-link")
github autostack-team / autostack / autostack / so_web_scraper / __init__.py View on Github external
    Parameter {bs4.Tag} post_summary: the bs4.Tag post summary.
    Returns {bs4.BeautifulSoup}: the BeautifulSoup of the post,
    if it has an accepted answer; otherwise, None.
    '''

    if has_accepted_answer(post_summary):
        post_url = get_post_url(post_summary)

        try:
            response = requests.get(BASE_URL + post_url)
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            return None

        return BeautifulSoup(response.text, 'lxml')

    return None