import codecs

import html5lib
from html5lib import treebuilders


def import_wiki(filename='wiki', hostname='localhost', port=8080):
    # _do_recipe, _do_bag and _do_tiddler are helpers defined elsewhere
    # in the originating project.
    f = codecs.open(filename, encoding='utf-8')
    wikitext = f.read()
    f.close()
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
    soup = parser.parse(wikitext)
    store_area = soup.find('div', id='storeArea')
    divs = store_area.findAll('div')
    _do_recipe(hostname, port)
    _do_bag(hostname, port)
    for tiddler in divs:
        _do_tiddler(hostname, port, tiddler)
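Note that recent html5lib releases no longer ship the 'beautifulsoup' treebuilder used above; the usual modern route is to let Beautiful Soup drive html5lib itself. A minimal sketch of the equivalent parse, reusing the file name from the snippet:

import codecs

from bs4 import BeautifulSoup

with codecs.open('wiki', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html5lib')

store_area = soup.find('div', id='storeArea')
tiddlers = store_area.find_all('div') if store_area else []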
"""Test the HTML5 writer and check output against XPath.

:copyright: Copyright 2007-2019 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
import xml.etree.cElementTree as ElementTree
from hashlib import md5

import pytest
from html5lib import getTreeBuilder, HTMLParser

from test_build_html import flat_dict, tail_check, check_xpath

from sphinx.util.docutils import is_html5_writer_available

TREE_BUILDER = getTreeBuilder('etree', implementation=ElementTree)
HTML_PARSER = HTMLParser(TREE_BUILDER, namespaceHTMLElements=False)

etree_cache = {}


@pytest.mark.skipif(not is_html5_writer_available(), reason='HTML5 writer is not available')
@pytest.fixture(scope='module')
def cached_etree_parse():
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        with (fname).open('rb') as fp:
            etree = HTML_PARSER.parse(fp)
            etree_cache.clear()
            etree_cache[fname] = etree
            return etree
    yield parse
    etree_cache.clear()
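A test built on this fixture might look like the sketch below; the testroot and the XPath check are illustrative, not taken from the original file:

@pytest.mark.sphinx('html', testroot='basic')  # hypothetical testroot
def test_html5_output(app, cached_etree_parse):
    app.build()
    # First call parses index.html; later calls are served from etree_cache.
    etree = cached_etree_parse(app.outdir / 'index.html')
    check_xpath(etree, 'index.html', ".//div[@class='body']", '')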
def rewrite_url(url, allow_dataurl=False):
    # Signature reconstructed from the call sites below; `use_data_urls`
    # comes from the enclosing scope in the original project.
    url = answers.task.get_static_asset_url(url, use_data_urls=use_data_urls)

    # Check final URL.
    import urllib.parse
    u = urllib.parse.urlparse(url)

    # Allow data URLs in some cases.
    if use_data_urls and allow_dataurl and u.scheme == "data":
        return url
    if u.scheme not in ("", "http", "https", "mailto"):
        return "javascript:alert('Invalid link.');"
    return url

import html5lib

dom = html5lib.HTMLParser().parseFragment(output)
for node in dom.iter():
    if node.get("href"):
        node.set("href", rewrite_url(node.get("href")))
    if node.get("src"):
        node.set("src", rewrite_url(node.get("src"),
                                    allow_dataurl=(node.tag == "{http://www.w3.org/1999/xhtml}img")))
output = html5lib.serialize(dom, quote_attr_values="always",
                            omit_optional_tags=False, alphabetical_attributes=True)

# But the p's-within-p's fix gives us a lot of empty p's.
output = output.replace("<p></p>", "")
return output
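The same parseFragment/serialize round trip works standalone; a minimal sketch, where the function name and sample markup are made up for illustration:

import html5lib

def strip_js_links(fragment):
    # Parse a fragment to an etree, neutralize javascript: hrefs,
    # then serialize with attributes always quoted.
    dom = html5lib.HTMLParser().parseFragment(fragment)
    for node in dom.iter():
        href = node.get("href")
        if href and href.lower().startswith("javascript:"):
            node.set("href", "#")
    return html5lib.serialize(dom, quote_attr_values="always")

print(strip_js_links('<a href="javascript:alert(1)">x</a>'))  # <a href="#">x</a>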
        raise ValueError("Cannot render %s to %s." % (template_format, output_format))
else:
    raise ValueError("Invalid template format encountered: %s." % template_format)
import time
import traceback

from bs4 import BeautifulSoup

# Benchmark harness reconstructed around the original timing lines.
for parser in ["lxml", "html.parser", "html5lib"]:
    success = False
    try:
        a = time.time()
        soup = BeautifulSoup(data, parser)
        b = time.time()
        success = True
    except Exception as e:
        print("%s could not parse the markup." % parser)
        traceback.print_exc()
    if success:
        print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print("Raw lxml parsed the markup in %.2fs." % (b-a))

import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
            self.http_status = 500
            # Something nasty happened :-(
            if not rdfOutput:
                raise e
            err = self.options.add_error(str(e), context=name)
            self.options.processor_graph.add_http_context(err, 500)
            return copyErrors(graph, self.options)

        dom = None
        try:
            msg = ""
            parser = None
            if self.options.host_language == HostLanguage.html5:
                import warnings
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                import html5lib
                parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
                if self.charset:
                    # This means the HTTP header has provided a charset, or the
                    # file is a local file, in which case we assume utf-8.
                    dom = parser.parse(input, encoding=self.charset)
                else:
                    # No charset given. The html5lib parser tries to sniff the
                    # file for a meta charset header; if that works, fine,
                    # otherwise it falls back on windows-...
                    dom = parser.parse(input)

        try:
            if isstring:
                input.close()
                input = self._get_input(name)
            else:
                input.seek(0)
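For reference, the "dom" treebuilder used above yields an xml.dom.minidom document; a self-contained sketch with made-up input:

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
dom = parser.parse("<p>Hello <b>RDFa</b></p>")
print(dom.getElementsByTagName("b")[0].firstChild.data)  # RDFa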
import html5lib
from xml.etree import cElementTree


def get_components_using_html5lib(html):
    """Find lesson components using the pure python html5lib library."""
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder('etree', cElementTree),
        namespaceHTMLElements=False)
    content = parser.parseFragment('<div>%s</div>' % html)[0]
    components = []
    for component in content.findall('.//*[@instanceid]'):
        component_dict = {'cpt_name': component.tag}
        component_dict.update(component.attrib)
        components.append(component_dict)
    return components
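An illustrative call (the component markup here is made up):

html = '<gcb-youtube instanceid="v1" videoid="abc"></gcb-youtube>'
print(get_components_using_html5lib(html))
# [{'cpt_name': 'gcb-youtube', 'instanceid': 'v1', 'videoid': 'abc'}]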
serializer = html5lib.serializer.HTMLSerializer(quote_attr_values="always", inject_meta_charset=False)
for text in serializer.serialize(tokens, encoding='utf-8'):
    pre_anolis_buffer.write(text)

filtered = pre_anolis_buffer

# replace data-x with data-anolis-xref
print("fixing xrefs")
filtered.seek(0)

# Parse
builder = treebuilders.getTreeBuilder("lxml", etree)
try:
    parser = html5lib.HTMLParser(tree=builder, namespaceHTMLElements=False)
except TypeError:
    parser = html5lib.HTMLParser(tree=builder)
tree = parser.parse(filtered, encoding='utf-8')

# Move introduction above conformance requirements
data_x = tree.findall("//*[@data-x]")
non_alphanumeric_spaces = re.compile(r"[^a-zA-Z0-9 \-_/|]+")
for refel in data_x:
    refel.attrib["data-anolis-xref"] = refel.get("data-x")
    if refel.tag == "dfn" and not refel.get("id", False) and refel.attrib["data-anolis-xref"]:
        refel.attrib["id"] = generateID(refel.attrib["data-anolis-xref"], refel)
    del refel.attrib["data-x"]

# utils.ids = {}
print('indexing')
# filtered.seek(0)
# tree = generator.fromFile(filtered, **opts)
generator.process(tree, **opts)
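For reference, `tokens` in a pipeline like this usually comes from a tree walker over a previously parsed document; a self-contained sketch of the serialize step, with a made-up input document and buffer:

import io

import html5lib
from html5lib import treewalkers
from html5lib.serializer import HTMLSerializer

doc = html5lib.parse('<p data-x="term">Example</p>', treebuilder="lxml")
tokens = treewalkers.getTreeWalker("lxml")(doc)

buf = io.BytesIO()
serializer = HTMLSerializer(quote_attr_values="always", inject_meta_charset=False)
for chunk in serializer.serialize(tokens, encoding="utf-8"):
    buf.write(chunk)
print(buf.getvalue().decode("utf-8"))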