How to use the bs4.builder.HTMLTreeBuilder function in bs4

To help you get started, we’ve selected a few bs4 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github betonme / e2openplugin-SeriesPlugin / src / Identifiers / bs4 / builder / _html5lib.py View on Github external
PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)
github pde / tosback2-data / web-frontend / beautifulsoup4 / build / lib / bs4 / builder / _html5lib.py View on Github external
PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)
github CouchPotato / CouchPotatoServer / libs / bs4 / builder / _lxml.py View on Github external
self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    def default_parser(self, encoding):
        return etree.HTMLParser

    def feed(self, markup):
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
            raise ParserRejectedMarkup(str(e))
github nnsun / CourseGrab / CourseGrab / WebJob / bs4 / builder / _html5lib.py View on Github external
HTMLTreeBuilder,
    )
from bs4.element import (
    NamespacedAttribute,
    whitespace_re,
)
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
github maximilianh / pubMunch / lib / bs4 / builder / _lxml.py View on Github external
self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '%s' % fragment
github h3llrais3r / Auto-Subliminal / libpy3 / bs4 / builder / _html5lib.py View on Github external
Comment,
    Doctype,
    NavigableString,
    Tag,
    )

try:
    # Pre-0.99999999
    from html5lib.treebuilders import _base as treebuilder_base
    new_html5lib = False
except ImportError as e:
    # 0.99999999 and up
    from html5lib.treebuilders import base as treebuilder_base
    new_html5lib = True

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
github rembo10 / headphones / bs4 / builder / _lxml.py View on Github external
self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'%s' % fragment
github kovidgoyal / html5-parser / src / html5_parser / soup.py View on Github external
def init_bs4_cdata_list_attributes():
    global cdata_list_attributes, universal_cdata_list_attributes
    from bs4.builder import HTMLTreeBuilder
    try:
        attribs = HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES
    except AttributeError:
        attribs = HTMLTreeBuilder.cdata_list_attributes

    cdata_list_attributes = {k: frozenset(v) for k, v in attribs.items()}
    universal_cdata_list_attributes = cdata_list_attributes['*']
github kovidgoyal / html5-parser / src / html5_parser / soup.py View on Github external
def init_bs4_cdata_list_attributes():
    global cdata_list_attributes, universal_cdata_list_attributes
    from bs4.builder import HTMLTreeBuilder
    try:
        attribs = HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES
    except AttributeError:
        attribs = HTMLTreeBuilder.cdata_list_attributes

    cdata_list_attributes = {k: frozenset(v) for k, v in attribs.items()}
    universal_cdata_list_attributes = cdata_list_attributes['*']
github morpheus65535 / bazarr / libs3 / bs4 / builder / _lxml.py View on Github external
self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        return etree.HTMLParser

    def feed(self, markup):
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)