# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
merged_css_file.close()
merged_css_data.close()
# make actions independent from HTML content but required for correct work of scripts.
makeCustomActions(input_html_dir, output_html_dir)
# remove comments from html.
for element in list( doc.getroot().iter(Comment) ): # iterate through copy of list since we need to remove elements from original list
#print html.tostring(element)
element.getparent().remove(element)
# create new html file
new_html_path = os.path.join( output_html_dir, os.path.split(html_filename)[1] )
html_file = open(new_html_path, 'w')
print >> html_file, doc.docinfo.doctype
print >> html_file, html.tostring(doc, pretty_print=True, include_meta_content_type=True, encoding='utf-8')
html_file.close()
return SUCCESS_CODE
if cid_mapping and message_data.get('body'):
root = lxml.html.fromstring(tools.ustr(message_data['body']))
postprocessed = False
for node in root.iter('img'):
if node.get('src', '').startswith('cid:'):
cid = node.get('src').split('cid:')[1]
attachment = cid_mapping.get(cid)
if not attachment:
attachment = fname_mapping.get(node.get('data-filename'), '')
if attachment:
attachment.generate_access_token()
node.set('src', '/web/image/%s?access_token=%s' % (attachment.id, attachment.access_token))
postprocessed = True
if postprocessed:
body = lxml.html.tostring(root, pretty_print=False, encoding='UTF-8')
message_data['body'] = body
return m2m_attachment_ids
timeout=600.,))
final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
# Ensure there are no scripts to be executed.
out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
root = html.fromstring(out['html'], parser=html.HTMLParser(),
base_url=final_url)
try:
head = root.xpath('./head')[0]
except IndexError:
head = html.Element('head')
root.insert(0, head)
if not head.xpath('./base/@href'):
head.insert(0, html.Element('base', {'href': final_url}))
if not head.xpath('./meta/@charset'):
head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
out['html'] = html.tostring(root, encoding='utf-8',
doctype='')
filename = re.sub(r'[^\w]+', '_', url) + '.html'
with open(os.path.join(sites_dir, filename), 'w') as f:
f.write(out['html'])
return filename
def load_player(self, member_url, team, char_name=None):
""" Loads player and team membership data, and adds as member to team. Return profile, membership """
try:
member_d = self.visit_url(member_url)
except IOError:
profile_name = " ".join((word.capitalize() for word in member_url.strip("/").split("/")[-1].split("-")))
print("Page not found, constructing from {0} name and {1} charname".format(profile_name, char_name))
# create profile and membership
profile, created = Profile(name=profile_name, user=self.master_user), True
profile.save()
membership = TeamMembership(team=team, profile=profile, char_name=char_name, active=False)
membership.save()
return profile, membership
if "Player not found in database" in tostring(member_d):
print("Player not found...skipping", file=self.stdout)
return
info_ps = member_d.cssselect('.content-section-1 p')
info_h3s = member_d.cssselect('.content-section-1 h3')
profile_name = info_ps[1].text
if char_name is None:
char_name = info_ps[4].text
if "." in char_name:
char_name = char_name.split(".", 1)[0]
if Profile.objects.filter(name=profile_name).count():
profile, created = Profile.objects.get(name=profile_name), False
membership, membership_created = TeamMembership.objects.get_or_create(team=team, profile=profile, defaults={'char_name': char_name})
membership.char_name = char_name
else:
try:
membership = TeamMembership.objects.get(team=team, char_name=char_name)
'spanish':'spa'}.get(lang, None)
if lang:
mi.language = lang
if ebook_isbn:
# print("ebook isbn is "+type('')(ebook_isbn[0]))
isbn = check_isbn(ebook_isbn[0].strip())
if isbn:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding='unicode').strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)', '', desc)
mi.comments = sanitize_comments_html(desc)
return None
def get_profile_page_html(linkedin_id, max_attempts=None):
    """Fetch the rendered HTML of a LinkedIn profile page.

    Repeatedly loads the page with a fresh dryscrape session until a
    non-empty document is obtained.  The original code expressed this
    with a misleading ``while``/``else`` (the ``else`` always runs on
    normal loop exit); this version simply returns after the loop.

    Args:
        linkedin_id: Path component of the profile URL (appended to
            ``https://www.linkedin.com/in/``).
        max_attempts: Optional cap on fetch attempts.  ``None`` (the
            default) retries indefinitely, preserving the original
            behaviour.

    Returns:
        The serialized page HTML as produced by ``lxml.html.tostring``.

    Raises:
        RuntimeError: If ``max_attempts`` is given and exhausted without
            obtaining a non-empty document.
    """
    attempts = 0
    profile_page_html = None
    while not profile_page_html:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                "Failed to fetch profile %r after %d attempts"
                % (linkedin_id, attempts))
        attempts += 1
        # A fresh session per attempt avoids reusing a wedged headless
        # browser state after a failed load.
        session = dryscrape.Session(base_url="https://www.linkedin.com/in/")
        session.visit(linkedin_id)
        profile_page_html = lxml.html.tostring(session.document())
        # Drop the session eagerly to release the browser resources.
        del session
    return profile_page_html
def set_meta_description(text):
    """Set the content of the page's ``<meta name="description">`` tag.

    Parses ``obj.HTML``, rewrites the description meta tag's ``content``
    attribute when such a tag exists (silently doing nothing otherwise),
    and serializes the document back into ``obj.HTML`` in every case.
    """
    dom = document_fromstring(obj.HTML)
    meta_tags = dom.findall('.//head/meta[@name="description"]')
    if meta_tags:
        meta_tags[0].attrib['content'] = text
    # Re-serialize even when no description tag was found, matching the
    # original round-trip behaviour.
    obj.HTML = tostring(dom)
container.drop_tree()
# The previous block handles most inline images, but for messages
# where the entire markdown input was just the URL of an image
# (i.e. the entire body is a message_inline_image object), the
# entire message body will be that image element; here, we need a
# more drastic edit to the content.
if fragment.get('class') == 'message_inline_image':
content_template = '<p><a title="%s" href="%s">%s</a></p>'
image_link = fragment.find('a').get('href')
image_title = fragment.find('a').get('title')
new_content = (content_template % (image_link, image_title, image_link))
fragment = lxml.html.fromstring(new_content)
fragment.make_links_absolute(base_url)
content = lxml.html.tostring(fragment).decode("utf-8")
return content
def make_parent_line(node, attach_head=False, question_contains=None):
    """Serialize *node* wrapped in the opening/closing tags of all of its
    ancestors, optionally prefixed by the document's ``<head>``.

    Args:
        node: lxml HTML element to serialize.
        attach_head: When True, prepend the serialized ``<head>`` of the
            enclosing ``<html>`` document to the result.
        question_contains: When given, the node's raw markup is first
            passed through ``does_this_element_contain`` with this value
            (that helper's contract is defined elsewhere in the project).

    Returns:
        The HTML string for the node including its ancestor context.
    """
    if question_contains is not None:
        newstr = does_this_element_contain(question_contains, lxml.html.tostring(node))
    else:
        # tostring() yields bytes; decode so the string formatting below
        # doesn't embed a b'...' literal into the output.
        newstr = lxml.html.tostring(node, encoding='utf8').decode('utf8')
    parent = node.getparent()
    while parent is not None:
        if attach_head and parent.tag == 'html':
            newstr = lxml.html.tostring(parent.find(
                './/head'), encoding='utf8').decode('utf8') + newstr
        tag, items = parent.tag, parent.items()
        attrs = " ".join('{}="{}"'.format(name, value) for name, value in items)
        # BUG FIX: the original format string '<{} {}>{}' had three
        # placeholders but four arguments, so the closing </tag> was
        # silently dropped and the output was unbalanced HTML.
        newstr = '<{} {}>{}</{}>'.format(tag, attrs, newstr, tag)
        parent = parent.getparent()
    return newstr
def _get_inner_text(html_node):
    """Return the plain-text content of an HTML node.

    Serializing with ``method='text'`` conveniently:
    - strips out ``<br>``\\s and other markup,
    - replaces ``<a>`` tags with just their text,
    - converts HTML entities (e.g. ``&mdash;`` and smart quotes) into
      their unicode equivalents.
    """
    raw_bytes = lxml.html.tostring(
        html_node,
        encoding='utf-8',
        method='text',
        with_tail=False,
    )
    return raw_bytes.decode('utf-8')
</a>