Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def parse_content_listing(page: Page) -> list:
    """Parse the content listing found on *page*'s first subpage.

    Collects the text of direct string children and of tags that do NOT
    carry any of the highlight classes ('bgB', 'bgY', 'Y'), splits the
    accumulated text into lines, drops blank (space-only) lines, and runs
    each remaining line through ``parse_content_entry``.

    :param page: a ``Page`` whose ``subpages[0].children`` yields
        BeautifulSoup nodes (plain strings and ``bs4.element.Tag``).
    :return: list of parsed content entries.
    """
    skip_classes = ('bgB', 'bgY', 'Y')
    # Accumulate fragments in a list and join once: avoids the quadratic
    # cost of repeated str += (and drops the dead `pass` the original had).
    parts = []
    for node in page.subpages[0].children:
        if isinstance(node, str):
            parts.append(node)
        elif isinstance(node, bs4.element.Tag):
            classes = node.attrs.get('class', [])
            # Skip tags marked with any highlight class; everything else
            # contributes its visible text.
            if not any(c in classes for c in skip_classes):
                parts.append(node.get_text())
    raw = ''.join(parts)
    # Keep only lines that contain something other than spaces.
    entries = [line for line in raw.splitlines()
               if not re.fullmatch(' *', line)]
    return [parse_content_entry(line) for line in entries]
# --- Fragment: tail of an HTML clean-up routine; `meta`, `soup` and the
# --- enclosing loop over <meta> tags are defined before this chunk.
# Detach the current <meta> tag from its original position.
meta.extract()
# Handle the title differently
# A meta named "Title" is promoted to a real <title> element in <head>;
# every other meta tag is re-inserted at the top of <head> as-is.
if meta["name"] == "Title":
title = soup.new_tag("title")
title.insert(0, meta["content"])
soup.head.insert(0, title)
else:
soup.head.insert(0, meta)
# Add the HTML doctype
soup.insert(0, bs4.element.Doctype("HTML"))
# Unwrap all useless p tags
# "Useless" means: no visible text at all once whitespace is stripped.
# unwrap() keeps the children (if any) but removes the <p> wrapper itself.
for ele in soup.find_all("p"):
if isinstance(ele, bs4.element.Tag):
if not ele.get_text().strip():
ele.unwrap()
# Wrap the parent element codes if needed
# Only spans that (a) have a class, (b) are not equations, and (c) sit
# directly inside a heading tag survive the guard clauses below.
for ele in soup.find_all("span"):
if "class" not in ele.attrs:
continue
if "inline-equation" in ele["class"]:
continue
if "block-equation" in ele["class"]:
continue
parent = ele.parent
if parent.name not in ["h1", "h2", "h3", "h4", "h5", "h6"]:
continue
# Decide whether a BeautifulSoup node should be treated as visible text.
# Processing instructions and doctypes are never visible; on Python 2
# (`not PY3`) the element is first encoded with the configured encoding.
def is_visible(self, elem):
if isinstance(elem, (bs4.element.ProcessingInstruction,
bs4.element.Doctype)):
return False
if not PY3:
elem = elem.encode(self.encoding, self.encoding_errors)
# NOTE(review): re.match('', elem) uses an EMPTY pattern, which matches
# every string at position 0 -- so this branch always fires and the
# function always returns False here. The real pattern (presumably one
# matching comments or other invisible markup) appears to have been lost;
# confirm against the upstream source before relying on this function.
if re.match('', elem):
return False
return True
def trimSoup(bs):
'''Strip leading and trailing whitespace-only nodes and <br> tags from
the children of *bs*, in place.

Runs two passes: forward (rev=False) trims the leading edge, reverse
(rev=True) trims the trailing edge. Whitespace-only strings and <br>
tags are removed; the first node with real content has its edge
whitespace stripped and the pass stops there.
'''
for rev in [False, True]:
# Snapshot the children so extract() during iteration is safe.
children = list(bs.children)
if rev:
children = reversed(children)
for c in children:
if isinstance(c, bs4.element.Tag):
if c.name == 'br':
# Edge <br> tags count as whitespace: drop and keep going.
c.extract()
continue
if isinstance(c, bs4.element.NavigableString):
text = c.string
# Strip the side we are currently trimming.
text = text.rstrip() if rev else text.lstrip()
if not text:
# Pure-whitespace node: remove it and continue inward.
c.extract()
continue
c.string.replace_with(text)
# NOTE(review): the flattened indentation makes the placement of this
# `break` ambiguous -- it appears to end the pass at the first node
# with real content (including non-<br> tags); confirm against the
# original source before restructuring.
break
# --- Fragment: top-level scraper script; `start_url`, `features`, `urls`
# --- and `file` are defined before this chunk.
# Fetch the first listing page and collect the pagination links from it.
with urllib.request.urlopen(start_url) as response:
start_page = bs4.BeautifulSoup(response, features) # Use lxml's HTML parser if possible
paginator = start_page.find('div', 'paginator')
for page_href in paginator.find_all('a'):
# Paginator hrefs are relative; prefix with the base URL and dedupe.
url = start_url + page_href.get('href')
if url not in urls:
urls.append(url)
# Walk every listing page, printing "NNN: title" per movie to `file`.
# The progress bar total of 250 matches the expected number of entries.
with tqdm.tqdm(total=250) as pbar:
for url in urls:
with urllib.request.urlopen(url) as response:
soup = bs4.BeautifulSoup(response, features)
movies_list = soup.find(id = 'content').ol
for movie_item in movies_list:
# The <ol> also yields whitespace strings; only tags are movies.
if isinstance(movie_item, bs4.element.Tag):
movie_no = int(movie_item.em.string)
movie_title = movie_item.find('span', 'title').string
print('{:03d}: {}'.format(movie_no, movie_title), file=file)
pbar.update(1)
def add(self, item):
    """Record *item* as a key of this mapping (value is always None).

    A ``bs4.element.Tag`` contributes its ``.string``; anything falsy
    after that extraction (None, empty string) is silently ignored.
    """
    key = item.string if isinstance(item, bs4.element.Tag) else item
    if not key:
        return
    self[key] = None
:param selector: CSS selector or a bs4.element.Tag object to identify
the form to select.
If not specified, ``selector`` defaults to "form", which is
useful if, e.g., there is only one form on the page.
For ``selector`` syntax, see the `.select() method in BeautifulSoup
<https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`__.
:param nr: A zero-based index specifying which form among those that
match ``selector`` will be selected. Useful when one or more forms
have the same attributes as the form you want to select, and its
position on the page is the only way to uniquely identify it.
Default is the first matching form (``nr=0``).
:return: The selected form as a soup object. It can also be
retrieved later with the :attr:`form` attribute.
"""
# --- Fragment: body of select_form; the `def` line and the start of its
# --- docstring precede this chunk.
# A Tag argument is used directly -- but it must actually be a <form>.
if isinstance(selector, bs4.element.Tag):
if selector.name != "form":
raise LinkNotFoundError
self.__state.form = Form(selector)
else:
# nr is a 0-based index for consistency with mechanize
# limit=nr+1 fetches just enough matches to pick the nr-th one;
# fewer than nr+1 results means the requested form does not exist.
found_forms = self.page.select(selector,
limit=nr + 1)
if len(found_forms) != nr + 1:
if self.__debug:
print('select_form failed for', selector)
self.launch_browser()
raise LinkNotFoundError()
self.__state.form = Form(found_forms[-1])
return self.form
# --- Fragment: elif-chain from a `study` method; the opening `if` and the
# --- surrounding signature precede this chunk. Each branch records where
# --- `keyword` was found (tag name, comment, or text) as an injection point.
elif keyword in bs_node.name:
# print("Found in tag name")
noscript = self.closeNoscript(bs_node)
d = {"type": "tag", "value": bs_node.name, "noscript": noscript}
# Deduplicate: the same injection point may be reached more than once.
if d not in entries:
entries.append(d)
# recursively search injection points for the same variable
for x in bs_node.contents:
self.study(x, parent=bs_node, keyword=keyword, entries=entries)
elif isinstance(bs_node, element.Comment):
# print("Found in comment, tag {0}".format(parent.name))
noscript = self.closeNoscript(bs_node)
d = {"type": "comment", "parent": parent.name, "noscript": noscript}
if d not in entries:
entries.append(d)
elif isinstance(bs_node, element.NavigableString):
# print("Found in text, tag {0}".format(parent.name))
noscript = self.closeNoscript(bs_node)
d = {"type": "text", "parent": parent.name, "noscript": noscript}
if d not in entries:
entries.append(d)
def ExtractXPath(element, use_id=True):
# Check that element is a tag node
if type(element) != bs4.element.Tag:
raise ExtractXPathError(
'%s is not a supported data type. '
'Only tag nodes from the tag tree are accepted.'
% type(element)
)
# Starting node
# Check id first
if use_id and element.get('id') is not None:
return '//*/' + element.name + '[@id="' + element.get('id') + '"]'
xpath = check_previous_tags(element)
# Parent Nodes
for parent in element.parents:
# End of XPath - exclude from string