How to use the bs4.element module in bs4

To help you get started, we’ve selected a few bs4 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github voidcase / txtv / txtv / listing.py View on Github external
def parse_content_listing(page: Page) -> list:
    """Parse the content listing found on the first subpage of ``page``.

    Text is gathered from the direct children of ``page.subpages[0]``:
    plain strings are taken as-is, and tags contribute their text unless
    they carry one of the ``bgB``, ``bgY`` or ``Y`` classes. The gathered
    text is split into lines, lines that are empty or consist only of
    spaces are dropped, and every remaining line is handed to
    ``parse_content_entry``.

    Returns:
        A list holding one ``parse_content_entry`` result per kept line.
    """
    skipped_classes = ('bgB', 'bgY', 'Y')
    parts = []
    for node in page.subpages[0].children:
        if isinstance(node, str):
            parts.append(node)
        elif isinstance(node, bs4.element.Tag):
            # A tag without a class attribute is never skipped.
            classes = node.attrs.get('class', ())
            if not any(cls in classes for cls in skipped_classes):
                parts.append(node.get_text())
    # Join once instead of growing a string inside the loop.
    lines = ''.join(parts).splitlines()
    return [
        parse_content_entry(line)
        for line in lines
        if not re.fullmatch(' *', line)
    ]
github thoppe / miniprez / miniprez / parser.py View on Github external
meta.extract()

        # Handle the title differently
        if meta["name"] == "Title":
            title = soup.new_tag("title")
            title.insert(0, meta["content"])
            soup.head.insert(0, title)
        else:
            soup.head.insert(0, meta)

    # Add the HTML doctype
    soup.insert(0, bs4.element.Doctype("HTML"))

    # Unwrap all useless p tags
    for ele in soup.find_all("p"):
        if isinstance(ele, bs4.element.Tag):
            if not ele.get_text().strip():
                ele.unwrap()

    # Wrap the parent element codes if needed
    for ele in soup.find_all("span"):
        if "class" not in ele.attrs:
            continue

        if "inline-equation" in ele["class"]:
            continue
        if "block-equation" in ele["class"]:
            continue

        parent = ele.parent
        if parent.name not in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            continue
github btimby / fulltext / fulltext / backends / __xml.py View on Github external
def is_visible(self, elem):
    """Return True if ``elem`` should be treated as visible text.

    Processing instructions and doctype nodes are never visible. On
    Python 2 the element is first encoded with ``self.encoding`` and
    ``self.encoding_errors``. Text that starts with an HTML comment
    (``<!-- ... -->``) is also reported as not visible.
    """
    if isinstance(elem, (bs4.element.ProcessingInstruction,
                         bs4.element.Doctype)):
        return False

    if not PY3:
        elem = elem.encode(self.encoding, self.encoding_errors)
    # The pattern here used to be empty, which matches every string and
    # made this method return False for all input. An HTML-comment
    # matcher is the apparent intent.
    # NOTE(review): confirm the pattern against the upstream project.
    if re.match('<!--.*-->', elem):
        return False

    return True
github svalgaard / fskintra / skoleintra / sbs4.py View on Github external
def trimSoup(bs):
    '''Strip leading and trailing whitespace, including <br>, from bs.

    Two passes are made over the direct children of bs: first from the
    front, then from the back. In each pass <br> tags and whitespace-only
    strings are removed until the first node with real content is found;
    if that node is a string it is trimmed on the side being processed.
    '''
    for from_end in (False, True):
        # Snapshot the children because nodes are extracted while looping.
        nodes = list(bs.children)
        if from_end:
            nodes.reverse()
        for node in nodes:
            if isinstance(node, bs4.element.Tag) and node.name == 'br':
                node.extract()
                continue
            if isinstance(node, bs4.element.NavigableString):
                content = node.string
                trimmed = content.rstrip() if from_end else content.lstrip()
                if not trimmed:
                    node.extract()
                    continue
                node.string.replace_with(trimmed)
            # First node with real content on this side: stop trimming.
            break
github myd7349 / Ongoing-Study / python / douban_movie_top250 / douban_movie_top250_v0.py View on Github external
with urllib.request.urlopen(start_url) as response:
        start_page = bs4.BeautifulSoup(response, features) # Use lxml's HTML parser if possible
        paginator = start_page.find('div', 'paginator')
        for page_href in paginator.find_all('a'):
            url = start_url + page_href.get('href')
            if url not in urls:
                urls.append(url)

    with tqdm.tqdm(total=250) as pbar:
        for url in urls:
            with urllib.request.urlopen(url) as response:
                soup = bs4.BeautifulSoup(response, features)
                movies_list = soup.find(id = 'content').ol
                for movie_item in movies_list:
                    if isinstance(movie_item, bs4.element.Tag):
                        movie_no = int(movie_item.em.string)
                        movie_title = movie_item.find('span', 'title').string
                        print('{:03d}: {}'.format(movie_no, movie_title), file=file)
                        pbar.update(1)
github snarfed / indie-map / src / sites_to_bigquery.py View on Github external
def add(self, item):
    """Store ``item`` as a key of this mapping, with ``None`` as its value.

    A ``bs4.element.Tag`` is replaced by its ``.string`` before storing.
    Falsy items (including a tag whose ``.string`` is ``None``) are ignored.
    """
    key = item.string if isinstance(item, bs4.element.Tag) else item
    if key:
        self[key] = None
github MechanicalSoup / MechanicalSoup / mechanicalsoup / stateful_browser.py View on Github external
:param selector: CSS selector or a bs4.element.Tag object to identify
            the form to select.
            If not specified, ``selector`` defaults to "form", which is
            useful if, e.g., there is only one form on the page.
            For ``selector`` syntax, see the `.select() method in BeautifulSoup
            <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors>`__.
        :param nr: A zero-based index specifying which form among those that
            match ``selector`` will be selected. Useful when one or more forms
            have the same attributes as the form you want to select, and its
            position on the page is the only way to uniquely identify it.
            Default is the first matching form (``nr=0``).

        :return: The selected form as a soup object. It can also be
            retrieved later with the :attr:`form` attribute.
        """
        if isinstance(selector, bs4.element.Tag):
            if selector.name != "form":
                raise LinkNotFoundError
            self.__state.form = Form(selector)
        else:
            # nr is a 0-based index for consistency with mechanize
            found_forms = self.page.select(selector,
                                           limit=nr + 1)
            if len(found_forms) != nr + 1:
                if self.__debug:
                    print('select_form failed for', selector)
                    self.launch_browser()
                raise LinkNotFoundError()
            self.__state.form = Form(found_forms[-1])

        return self.form
github IFGHou / wapiti / wapitiCore / attack / mod_xss.py View on Github external
elif keyword in bs_node.name:
                    # print("Found in tag name")
                    noscript = self.closeNoscript(bs_node)
                    d = {"type": "tag", "value": bs_node.name, "noscript": noscript}
                    if d not in entries:
                        entries.append(d)
                # recursively search injection points for the same variable
                for x in bs_node.contents:
                    self.study(x, parent=bs_node, keyword=keyword, entries=entries)
            elif isinstance(bs_node, element.Comment):
                # print("Found in comment, tag {0}".format(parent.name))
                noscript = self.closeNoscript(bs_node)
                d = {"type": "comment", "parent": parent.name, "noscript": noscript}
                if d not in entries:
                    entries.append(d)
            elif isinstance(bs_node, element.NavigableString):
                # print("Found in text, tag {0}".format(parent.name))
                noscript = self.closeNoscript(bs_node)
                d = {"type": "text", "parent": parent.name, "noscript": noscript}
                if d not in entries:
                    entries.append(d)
github mozilla / OpenWPM / automation / Commands / utils / XPathUtil.py View on Github external
def ExtractXPath(element, use_id=True):
    # Check that element is a tag node
    if type(element) != bs4.element.Tag:
        raise ExtractXPathError(
            '%s is not a supported data type. '
            'Only tag nodes from the tag tree are accepted.'
            % type(element)
        )

    # Starting node
    # Check id first
    if use_id and element.get('id') is not None:
        return '//*/' + element.name + '[@id="' + element.get('id') + '"]'

    xpath = check_previous_tags(element)

    # Parent Nodes
    for parent in element.parents:
        # End of XPath - exclude from string