How to use the bs4.SoupStrainer class in bs4

To help you get started, we’ve selected a few bs4.SoupStrainer examples based on popular ways it is used in public projects.
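
Before diving into the project examples, here is a minimal, self-contained sketch of the core idea (the HTML string and the only_links name are illustrative, not taken from any project): a SoupStrainer passed via parse_only tells BeautifulSoup to keep only the matching elements while parsing, which saves time and memory on large pages.

from bs4 import BeautifulSoup, SoupStrainer

html = """
<html><body>
  <a href="/docs">Docs</a>
  <p>Some text we do not need.</p>
  <a href="/blog">Blog</a>
</body></html>
"""

# Build a tree that contains only <a> tags; everything else is skipped during parsing.
only_links = SoupStrainer("a")
soup = BeautifulSoup(html, "html.parser", parse_only=only_links)

for link in soup.find_all("a"):
    print(link["href"])  # prints /docs, then /blog

Note that parse_only is honored by the html.parser and lxml builders but ignored by html5lib, which always parses the whole document.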

github carpedm20 / fbchat / fbchat / _session.py
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None

github knight42 / LeetCodeSpider / crawler.py
    def get_submissions(self, specified_langs):
        submurl = 'https://leetcode.com/submissions/'
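        # Only parse the <tbody> of the submissions table on each page.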
        strainer = SoupStrainer('tbody')
        memory = defaultdict(dict)
        for i in itertools.count(1):
            url = urljoin(submurl, str(i))
            soup = self.get_soup(url, strainer)
            rowlist = soup.find_all('tr')
            if rowlist == []:
                break
            eachpage = defaultdict(dict)
            for row in rowlist:
                _, title, status, _, lang = list(row.stripped_strings)
                if status == 'Accepted':
                    title = title.replace(' ', '_')
                    if not memory[title].get(lang):
                        memory[title][lang] = urljoin(self.BASEURL, row.find_all('a')[1]['href'])
                        eachpage[title][lang] = memory[title][lang]
            info = []

github agrawalsmart7 / autoRecon / Phase-list / subdomainenum.py
def httpurlstates(y, req):
	try:
		wadresults.setdefault(y, [])

		# Parse only the <meta> tags of the response body.
		parse = BeautifulSoup(req.content, 'html.parser', parse_only=SoupStrainer('meta'))

		# Record the generator meta value (the generating software) when present.
		for link in parse:
			if link.has_attr('name') and 'generator' in link['name']:
				wadresults[y].append(link['content'])

		# Record selected response headers, using an empty string when absent.
		for x in headers:
			if x in req.headers:
				value = req.headers.get(x)
				wadresults[y].append(value)
			else:
				wadresults[y].append('')

github mosbth / irc2phpbb / old / phpmanual.py
    try:
        #print('Start to read') DEBUG
        siteData = urllib2.urlopen(url)
        #print('Done reading.') DEBUG
    except urllib2.HTTPError, e:
        print(e.code)
    except urllib2.URLError, e:
        print(e.args)

    # This is the default value that will be returned if nothing is found.
    result = 'Found nothing.'

    # Actually parse and find the text 
    if siteData is not None:
        # Use SoupStrainer to only parse what I need
        tagsWithClass = SoupStrainer('p',{'class': 'refpurpose'})

        #print('Done creating SoupStrainer.') DEBUG

        # Create the soup object, using the SoupStrainer.
        # This is what takes the most time (hence the .txt-file cache)
        soup = BeautifulSoup(siteData, "lxml",  parse_only=tagsWithClass)

        #print('Done creating BeautifulSoup.') DEBUG

        # Get the specific tag I need
        shortDescrPtag = soup.find("p", { "class" : "refpurpose" })

        #print('Done finding tag.') DEBUG
        try:
            # Put the text without html tags in my fancy string
            result = 'PHP-manualen: ' + shortDescrPtag.get_text() + ' - ' + url

github alfiepoleon / kenya-news-scrapper / business / business_web.py
def get_standard():
    standard_url = 'https://www.standardmedia.co.ke/business/category/19/business-news'
    if check_connection(standard_url):
        standard = requests.get(standard_url)
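        # Parse only the div elements; headlines are read from the h4 > a links below.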
        soup = BeautifulSoup(standard.text, 'lxml', parse_only=SoupStrainer('div'))
        standard = []
        for link in soup.select('h4 a', limit=14):
            if link.get_text():
                news_title = '{}({})'.format(link.get_text().strip(), link.get('href'))
                standard_link = requests.get(link.get('href'))
                soup_link = BeautifulSoup(standard_link.text, 'lxml', parse_only=SoupStrainer(['script']))
                article_date = 0
                content = ''
                image = ''
                try:
                    data = json.loads(soup_link.find('script', type='application/ld+json').text.replace("\\", r"\\"))
                    article_date = data['dateModified']
                    content = data['description']
                    image = data['image']['url']
                    if image == 'https://www.standardmedia.co.ke':
                        image = ''
                    print(image)
                except ValueError:
                    print('Standard: invalid json detected')
                    continue

                news_dict = {

github konlpy / konlpy / konlpy / stream / dcinside.py
action='store_true',
        )
        parser.add_argument(
            '--filename',
            help="filename to be saved.",
            default="gallery.txt"
        )

        self.options, _ = parser.parse_known_args()
        self._session = requests.Session()
        self._markup = markup
        self._view_url = 'http://gall.dcinside.com/board/view'
        self._comment_view_url = 'http://gall.dcinside.com/board/view'
        self._current_post_id = self.options.init_post_id

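        # Only parse the post-page divs that are actually needed: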
        self._strainer = SoupStrainer('div', attrs={'class': [
            're_gall_top_1',    # title, author, timestamp
            'btn_recommend',    # upvote, downvote counts
            'gallery_re_title',  # comments
            's_write',          # post body
        ]})
        # A custom header is required for the request to succeed.
        self.header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}

github evansloan / sports.py / sports / teams.py
def _get_team_links(base_url, table_id):
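    # Restrict parsing to the team table with the given id.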
    links = SoupStrainer('table', {'id': table_id})
    return BeautifulSoup(requests.get(base_url).content, 'html.parser', parse_only=links)