# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_all_tokens(self):
    """Parse "a<div>b</div>c" with every registered tree type and check
    that its walker yields the expected token stream.
    """
    expected = [
        {'data': [], 'type': 'StartTag', 'name': 'html'},
        {'data': [], 'type': 'StartTag', 'name': 'head'},
        {'data': [], 'type': 'EndTag', 'name': 'head'},
        {'data': [], 'type': 'StartTag', 'name': 'body'},
        {'data': 'a', 'type': 'Characters'},
        {'data': [], 'type': 'StartTag', 'name': 'div'},
        {'data': 'b', 'type': 'Characters'},
        {'data': [], 'type': 'EndTag', 'name': 'div'},
        {'data': 'c', 'type': 'Characters'},
        {'data': [], 'type': 'EndTag', 'name': 'body'},
        {'data': [], 'type': 'EndTag', 'name': 'html'}
    ]
    for treeName, treeCls in treeTypes.items():
        p = html5parser.HTMLParser(tree=treeCls["builder"])
        document = p.parse("a<div>b</div>c")
        # Some tree types need an adapter before walking; default is identity.
        document = treeCls.get("adapter", lambda x: x)(document)
        output = treeCls["walker"](document)
        # zip() stops at the shorter stream, so surplus walker tokens are not
        # checked here (preserving the original test's tolerance).
        for expectedToken, outputToken in zip(expected, output):
            # assertEquals is a deprecated alias, removed in Python 3.12.
            self.assertEqual(expectedToken, outputToken)
def sanitize_html(stream):
    # Parse the stream with the sanitizing tokenizer, then serialize each
    # child of the resulting fragment back to markup.
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = parser.parseFragment(stream)
    return ''.join(node.toxml() for node in fragment.childNodes)
def sanitize_html(self, stream):
    # Method flavour of the sanitizer helper: run the input through the
    # sanitizing tokenizer and concatenate the XML of every fragment child.
    sanitizing_parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    children = sanitizing_parser.parseFragment(stream).childNodes
    pieces = [child.toxml() for child in children]
    return ''.join(pieces)
# NOTE(review): CLI script fragment — the `try:` matching the except clauses
# below begins before this chunk, and indentation was lost in extraction.
# Try opening from file system
f = open(f)
except IOError: pass
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
# Select the sanitizing tokenizer when --sanitize was requested.
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
if opts.sanitize:
tokenizer = sanitizer.HTMLSanitizer
else:
tokenizer = HTMLTokenizer
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)
# Fragment vs. full-document parsing is chosen by CLI flag.
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
#XXX should import cProfile instead and use that
# hotshot was deprecated and removed in Python 3 — see the XXX above.
import hotshot
import hotshot.stats
prof = hotshot.Profile('stats.prof')
prof.runcall(parseMethod, f, encoding=encoding)
prof.close()
# XXX - We should use a temp file here
stats = hotshot.stats.load('stats.prof')
stats.strip_dirs()
def favicon(page):
    """Return the favicon URL for *page*, or None if it has zero length.

    Parses the page with html5lib, scans <link rel=... href=...> elements
    for an icon relation, and falls back to /favicon.ico at the site root.
    The final urlopen() check rejects icons served with content-length 0.
    """
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            # rel is a whitespace-separated token list; split() handles runs
            # of spaces and tabs, where the original split(' ') produced
            # empty tokens and could miss 'icon'.
            if 'icon' in link.attributes['rel'].value.lower().split():
                favicon = urljoin(page, link.attributes['href'].value)
    if urlopen(favicon).info()['content-length'] != '0':
        return favicon
f = open(f, "rb")
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
if opts.sanitize:
tokenizer = sanitizer.HTMLSanitizer
else:
tokenizer = HTMLTokenizer
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log)
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
import cProfile
import pstats
cProfile.runctx("run(parseMethod, f, encoding)", None,
{"run": run,
"parseMethod": parseMethod,
"f": f,
"encoding": encoding},
"stats.prof")
# XXX - We should use a temp file here
def camoify(ctx, value):
    """Rewrite every <img src> in *value* through the request's camo proxy."""
    request = ctx.get("request") or get_current_request()
    # Parse the rendered output and replace any inline images that don't point
    # to HTTPS with camouflaged images.
    builder = html5lib.treebuilders.getTreeBuilder("dom")
    document = html5lib.html5parser.HTMLParser(tree=builder).parse(value)
    for img in document.getElementsByTagName("img"):
        original_src = img.getAttribute("src")
        if original_src:
            img.setAttribute("src", request.camo_url(original_src))
    walker = html5lib.treewalkers.getTreeWalker("dom")
    html_serializer = html5lib.serializer.HTMLSerializer()
    return "".join(html_serializer.serialize(walker(document)))
# NOTE(review): spec-build fragment — the dict literal these option lines
# close begins before this chunk; indentation was lost in extraction.
'w3c_compat': True,
'w3c_compat_xref_a_placement': False,
'w3c_compat_xref_elements': False,
'w3c_compat_xref_normalization': False,
}
# Allow per-spec overrides from the "anolis" section of the config.
if "anolis" in conf:
opts.update(conf["anolis"])
if spec == "srcset":
# Python 2 print statement — this file predates Python 3.
print 'munging (before anolis)'
filtered.seek(0)
pre_anolis_buffer = StringIO()
# Parse
parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
tree = parser.parse(filtered, encoding='utf-8')
# Move introduction above conformance requirements
introduction = tree.findall("//*[@id='introduction']")[0]
intro_ps = introduction.xpath("following-sibling::*")
target = tree.findall("//*[@id='conformance-requirements']")[0]
target.addprevious(introduction)
target = introduction
# addnext inserts immediately after `target`, so inserting ps[2], ps[1],
# ps[0] in that order preserves their original relative order.
target.addnext(intro_ps[2])
target.addnext(intro_ps[1])
target.addnext(intro_ps[0])
# Serialize
tokens = html5lib.treewalkers.getTreeWalker('lxml')(tree)
serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
for text in serializer.serialize(tokens, encoding='utf-8'):
# NOTE(review): feed-content normalizer fragment (Python 2: `unicode`,
# `has_key`); the enclosing function begins before this chunk and the
# dangling else: at the end continues past it.
if isinstance(detail.value,unicode):
detail.value=detail.value.encode('utf-8')
# Content without an HTML-ish MIME type is escaped and relabeled text/html.
if not detail.has_key('type') or detail.type.lower().find('html')<0:
detail['value'] = escape(detail.value)
detail['type'] = 'text/html'
if detail.type.find('xhtml')>=0 and not bozo:
try:
data = minidom.parseString(xdiv % detail.value).documentElement
xcontent.setAttribute('type', 'xhtml')
except:
# NOTE(review): bare except — any parse failure marks the entry bozo.
bozo=1
# Fall back to the forgiving html5lib parser for non-XHTML or bozo content.
if detail.type.find('xhtml')<0 or bozo:
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
# Walk html > body > div, the wrapper structure produced by xdiv above.
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue
if body.nodeName != 'body': continue
for div in body.childNodes:
if div.nodeType != Node.ELEMENT_NODE: continue
if div.nodeName != 'div': continue
try:
div.normalize()
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
# Strip characters illegal in XML before emitting text nodes.
if illegal_xml_chars.search(data.data):
data = xdoc.createTextNode(
illegal_xml_chars.sub(invalidate, data.data))
else:
def extract_images(data, plugin):
"""
extracts base64 encoded images from drag and drop actions in browser and saves
those images as plugins
"""
# Short-circuit when the project has no image-save hook configured.
if not settings.TEXT_SAVE_IMAGE_FUNCTION:
return data
tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
dom = parser.parse(data)
found = False
for img in dom.getElementsByTagName('img'):
src = img.getAttribute('src')
if not src.startswith('data:'):
# nothing to do
continue
width = img.getAttribute('width')
height = img.getAttribute('height')
# extract the image data
# NOTE(review): the first two groups lost their names in this copy —
# `(?P[^"]*)` is invalid regex syntax and will not compile. Upstream
# presumably had (?P<mime_type>...) and a second named group (the
# encoding); confirm against the original project before relying on it.
data_re = re.compile(r'data:(?P[^"]*);(?P[^"]*),(?P<data>[^"]*)')
m = data_re.search(src)
dr = m.groupdict()
mime_type = dr['mime_type']
image_data = dr['data']
# NOTE(review): str.find returns 0 (falsy) for a leading ';' and -1
# (truthy) when absent — this is almost certainly meant to be `!= -1`;
# confirm intent. (The if-body is cut off in this view.)
if mime_type.find(';'):