Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if innerHTML:
innerHTML = str(innerHTML, "utf8")
if errors:
errors = str(errors, "utf8")
errors = errors.split("\n")
expected = str(expected, "utf8")
try:
if innerHTML:
document = p.parseFragment(io.BytesIO(input), innerHTML)
else:
try:
document = p.parse(io.BytesIO(input))
except constants.DataLossWarning:
sys.stderr.write("Test input causes known dataloss, skipping")
return
except:
errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
self.assertTrue(False, errorMsg)
output = convertTreeDump(p.tree.testSerializer(document))
output = attrlist.sub(sortattrs, output)
expected = convertExpected(expected)
expected = attrlist.sub(sortattrs, expected)
errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected:", expected,
"\nReceived:", output])
errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
self.assertTrue(False, errorMsg)
output = convertTreeDump(p.tree.testSerializer(document))
output = attrlist.sub(sortattrs, output)
expected = convertExpected(expected)
expected = attrlist.sub(sortattrs, expected)
errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected:", expected,
"\nReceived:", output])
self.assertEquals(expected, output, errorMsg)
errStr = ["Line: %i Col: %i %s %s"%(line, col,
constants.E[errorcode], datavars) for
((line,col), errorcode, datavars) in p.errors]
errorMsg2 = "\n".join(["\n\nInput:", str(input, "utf8"),
"\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
"\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
if checkParseErrors:
self.assertEquals(len(p.errors), len(errors), errorMsg2)
import os
import unittest
from support import simplejson, html5lib_test_files
from html5lib import html5parser, serializer, constants
from html5lib.treewalkers._base import TreeWalker
default_namespace = constants.namespaces["html"]
class JsonWalker(TreeWalker):
def __iter__(self):
for token in self.tree:
type = token[0]
if type == "StartTag":
if len(token) == 4:
namespace, name, attrib = token[1:]
else:
namespace = default_namespace
name, attrib = token[1:]
yield self.startTag(namespace, name, attrib)
elif type == "EndTag":
if len(token) == 3:
namespace, name = token[1:]
else:
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
if token["namespace"] in constants.prefixes:
ns = constants.prefixes[token["namespace"]]
else:
ns = token["namespace"]
name = "%s %s" % (ns, token["name"])
else:
name = token["name"]
output.append("%s<%s>" % (" " * indent, name))
indent += 2
# attributes (sorted for consistent ordering)
attrs = token["data"]
for (namespace, localname), value in sorted(attrs.items()):
if namespace:
if namespace in constants.prefixes:
ns = constants.prefixes[namespace]
else:
ns = namespace
name = "%s %s" % (ns, localname)
else:
name = localname
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
# self-closing
if type == "EmptyTag":
indent -= 2
elif type == "EndTag":
indent -= 2
elif type == "Comment":
output.append("%s" % (" " * indent, token["data"]))
if ns is not None:
elem.tag = '{%s}%s'%(ns, tag)
for b in tuple(elem.attrib):
idx = b.find('U0003A')
if idx > -1:
prefix, tag = b[:idx], b[idx+6:]
ns = elem.nsmap.get(prefix, None)
if ns is None:
ns = non_html5_namespaces.get(prefix, None)
if ns is not None:
elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)
seen_namespaces |= set(elem.nsmap.itervalues())
nsmap = dict(html5lib.constants.namespaces)
nsmap[None] = nsmap.pop('html')
non_html5_namespaces.update(nsmap)
nsmap = non_html5_namespaces
data = clone_element(data, nsmap=nsmap, in_context=False)
# Remove unused namespace declarations
fnsmap = {k:v for k,v in nsmap.iteritems() if v in seen_namespaces and v !=
XMLNS_NS}
return clone_element(data, nsmap=fnsmap, in_context=False)
def handle(self, *args, **options):
# Not ideal, but we need to temporarily remove some inline elements
# ('link' and 'img') from the set of void/ignored elements
# TODO: Can this clone code be shortened?
new_void_set = set()
for item in html5lib_constants.voidElements:
new_void_set.add(item)
new_void_set.remove('link')
new_void_set.remove('img')
html5lib_constants.voidElements = frozenset(new_void_set)
# Create a mock request for the sake of rendering the template
request = RequestFactory().get('/')
request.LANGUAGE_CODE = settings.LANGUAGE_CODE
request.META['SERVER_NAME'] = 'developer.mozilla.org'
# Load the page with sphinx template
content = render(request, 'wiki/sphinx.html',
{'is_sphinx': True, 'gettext': ugettext}).content
# Use a filter to make links absolute
tool = parse(content, is_full_document=True)
content = tool.absolutizeAddresses(
base_url=settings.PRODUCTION_URL,
tag_attributes={
'a': 'href',
else:
rv.append("|%s"%(' '*indent, element.name))
else:
rv.append("|%s"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
i = 0
attr = element.attributes.item(i)
while attr:
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
i += 1
elif isinstance(element, str):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
for name, value in element.attrib.items():
nsmatch = etree_builders.tag_regexp.match(name)
if nsmatch:
ns = nsmatch.group(1)
name = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
prefix,
filter.fromXmlName(name),
elif isinstance(element, basestring):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = filter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)