import html5lib
from html5lib import treebuilders

def process_tiddler(content):
    """
    Turn some content into a div element representing a tiddler.
    """
    content = _escape_brackets(content)  # helper defined elsewhere in this module
    # The 'beautifulsoup' treebuilder requires an html5lib version old
    # enough to still ship BeautifulSoup support.
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('beautifulsoup'))
    soup = parser.parse(content)
    return soup.find('div')
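A quick way to exercise it (the stub below stands in for the module's private _escape_brackets helper, which isn't shown, and an html5lib old enough to ship the BeautifulSoup treebuilder is assumed):

def _escape_brackets(text):
    # Illustrative stand-in for the real helper, which is not shown here.
    return text

print(process_tiddler("<div class='tiddler'>Hello, world!</div>"))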
# The handlers below close a try block (not shown) that opens the source
# URI; the "except HTTPError :" header and the binding of h are
# reconstructed from the way h.http_code and h.msg are used.
except HTTPError :
    h = sys.exc_info()[1]
    self.http_status = h.http_code
    if not rdfOutput : raise h
    return self._generate_error_graph(graph, "HTTP Error: %s (%s)" % (h.http_code, h.msg), uri=name)
except Exception :
    # Something nasty happened:-(
    e = sys.exc_info()[1]
    self.http_status = 500
    if not rdfOutput : raise e
    return self._generate_error_graph(graph, str(e), uri=name)
dom = None
try :
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    import html5lib
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    dom = parser.parse(input)
    return self.graph_from_DOM(dom, graph)
except ImportError :
    msg = "HTML5 parser not available. Try installing html5lib"
    raise ImportError(msg)
except Exception :
    # Something nasty happened:-(
    e = sys.exc_info()[1]
    self.http_status = 400
    if not rdfOutput : raise e
    return self._generate_error_graph(graph, str(e), uri=name)
except Exception :
    # Handler on an enclosing try block. Something nasty happened:-(
    e = sys.exc_info()[1]
    # The snippet is truncated after the isinstance check; the body below
    # is an assumed completion mirroring the handlers above.
    if isinstance(e, ImportError) :
        self.http_status = None
    else :
        self.http_status = 500
    if not rdfOutput : raise e
    return self._generate_error_graph(graph, str(e), uri=name)
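At its core the fragment above just asks html5lib for a minidom tree and hands it on to graph_from_DOM. That html5lib step in isolation looks like this (a minimal sketch; the error plumbing and the distiller-specific calls are omitted):

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
dom = parser.parse("<p>Hello <b>world</b></p>")
print(dom.getElementsByTagName("b")[0].toxml())   # <b>world</b>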
import html5lib
from urllib.parse import urljoin   # on Python 2: from urlparse import urljoin

def rewrite_links(src, src_base, url_filter, rewrite_base,
                  url_attributes=(('a', 'href'),)):
    """Rewrite URLs in src and return the result.

    First, src is parsed as HTML. Second, every URL found in the
    document is resolved relative to src_base and the result passed to
    the function url_filter. If this returns False, the URL is resolved
    again, this time relative to rewrite_base, and the result stored
    back into the document. Finally, the updated document is serialized
    as HTML and returned.

    The keyword argument url_attributes is a sequence of (tag,
    attribute) pairs that contain URLs to be rewritten.
    """
    tree_builder = html5lib.treebuilders.getTreeBuilder('dom')
    parser = html5lib.html5parser.HTMLParser(tree=tree_builder)
    dom = parser.parse(src)
    for tag, attr in url_attributes:
        for e in dom.getElementsByTagName(tag):
            u = e.getAttribute(attr)
            if u and not url_filter(urljoin(src_base, u)):
                rewritten = urljoin(rewrite_base, u)
                if u != rewritten:
                    e.setAttribute(attr, rewritten)
    tree_walker = html5lib.treewalkers.getTreeWalker('dom')
    # html5lib < 1.0 module path; on 1.0+ use html5lib.serializer.HTMLSerializer()
    html_serializer = html5lib.serializer.htmlserializer.HTMLSerializer()
    return u''.join(html_serializer.serialize(tree_walker(dom)))
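A hypothetical call, leaving off-site links alone and re-basing on-site ones onto a mirror (the URLs and the filter are illustrative, not from the original source):

html = '<a href="/about">About</a> <a href="http://other.net/">Other</a>'
print(rewrite_links(
    html,
    src_base='http://example.com/',
    # Approve only off-site links; on-site ones get re-based below.
    url_filter=lambda url: not url.startswith('http://example.com/'),
    rewrite_base='http://mirror.example.org/'))
# The "/about" link now points at http://mirror.example.org/about.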
"""
top_level_elements = parser.parseFragment(html)
container = Element(self.CONTAINER_TAG)
# Why lxml couldn't just have text nodes, I'll never understand.
# Text nodes that come other than first are automatically stuffed
# into the tail attrs of the preceding elements by html5lib.
if top_level_elements and isinstance(top_level_elements[0],
basestring):
container.text = top_level_elements.pop(0)
container.extend(top_level_elements)
return container
p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
self._root = really_parse_fragment(p, html)
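The tail-vs-text behavior those comments describe is easy to see in isolation (a standalone sketch, assuming lxml is installed and the 'lxml' treebuilder is the one the class uses):

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder('lxml'))
pieces = parser.parseFragment('leading text <b>bold</b> trailing text')
print(pieces[0])        # the leading string: "leading text "
print(pieces[1].tail)   # " trailing text", stored on the <b> element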
# This fragment picks up inside a feed-discovery function: client (an
# httplib2-style HTTP client), log, traceback, the http helper module,
# and the HTML_TYPES/FEED_TYPES sets all come from the surrounding
# module, and the opening try: is reconstructed here.
try:
    headers = {'Connection': 'close'}
    response, content = client.request(query, headers=headers)
    if response.status != 200:
        return None
    ct = response.get('content-type', '')
    if ';' in ct:
        ct = ct[:ct.find(';')]
    ct = ct.strip()
    if ct not in HTML_TYPES:
        return None
    # parse html
    links = []
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree", etree),
                                 namespaceHTMLElements=False)
    html = parser.parse(content)
    for link in html.find("head").findall("link"):
        if link.get("rel", "").lower() == "alternate":
            linktype = link.get("type", "").lower()
            if linktype in FEED_TYPES:
                href = link.get("href", "")
                if href:
                    feed_item = {'url': href, 'title': link.get("title", "")}
                    links.append(feed_item)
    return links
except Exception:  # bare "except:" in the original; Exception is safer
    log.error("Error finding feed links at %s: %s" % (query, traceback.format_exc()))
    return None
finally:
    http.close_all(client)
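The discovery logic separates cleanly from the HTTP handling. A self-contained sketch of just the parsing half (feed_links_in and the FEED_TYPES values here are illustrative, not the original module's):

import html5lib
import xml.etree.ElementTree as etree
from html5lib import treebuilders

FEED_TYPES = {'application/rss+xml', 'application/atom+xml'}

def feed_links_in(content):
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree", etree),
                                 namespaceHTMLElements=False)
    html = parser.parse(content)
    links = []
    for link in html.find("head").findall("link"):
        if (link.get("rel", "").lower() == "alternate"
                and link.get("type", "").lower() in FEED_TYPES
                and link.get("href")):
            links.append({'url': link.get("href"), 'title': link.get("title", "")})
    return links

print(feed_links_in('<link rel="alternate" type="application/atom+xml" '
                    'href="/feed.atom" title="Atom">'))
# [{'url': '/feed.atom', 'title': 'Atom'}]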
def __init__(self, api='etree'):
    # If no default implementation is defined for this api, fall back to
    # None and let getTreeBuilder() pick the corresponding implementation.
    implementation = self.defaults.get(api, None)
    HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder(api, implementation))
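A hypothetical class around that __init__, just to show the wiring (the class name and the defaults map are illustrative):

from html5lib import treebuilders
from html5lib.html5parser import HTMLParser

class ConfigurableParser(HTMLParser):
    # Preferred implementation per treebuilder api; None lets html5lib choose.
    defaults = {'etree': None, 'dom': None}

    def __init__(self, api='etree'):
        implementation = self.defaults.get(api, None)
        HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder(api, implementation))

dom = ConfigurableParser(api='dom').parse('<p>hi</p>')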