How to use the lxml.html.tostring function in lxml

To help you get started, we’ve selected a few lxml examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github a0ivanov / aimp-control-plugin / tools / html_optimizer.py View on Github external
merged_css_file.close()    
    merged_css_data.close()

    # make actions independent from HTML content but required for correct work of scripts.
    makeCustomActions(input_html_dir, output_html_dir)

    # remove comments from html.
    for element in list( doc.getroot().iter(Comment) ): # iterate through copy of list since we need to remove elements from original list
        #print html.tostring(element)
        element.getparent().remove(element)
    
    # create new html file
    new_html_path = os.path.join( output_html_dir, os.path.split(html_filename)[1] )
    html_file = open(new_html_path, 'w')
    print >> html_file, doc.docinfo.doctype
    print >> html_file, html.tostring(doc, pretty_print=True, include_meta_content_type=True, encoding='utf-8')
    html_file.close()
    
    return SUCCESS_CODE
github odoo / odoo / addons / mail / models / mail_thread.py View on Github external
if cid_mapping and message_data.get('body'):
            root = lxml.html.fromstring(tools.ustr(message_data['body']))
            postprocessed = False
            for node in root.iter('img'):
                if node.get('src', '').startswith('cid:'):
                    cid = node.get('src').split('cid:')[1]
                    attachment = cid_mapping.get(cid)
                    if not attachment:
                        attachment = fname_mapping.get(node.get('data-filename'), '')
                    if attachment:
                        attachment.generate_access_token()
                        node.set('src', '/web/image/%s?access_token=%s' % (attachment.id, attachment.access_token))
                        postprocessed = True
            if postprocessed:
                body = lxml.html.tostring(root, pretty_print=False, encoding='UTF-8')
                message_data['body'] = body

        return m2m_attachment_ids
github scrapinghub / splash / splash / benchmark / download_sites.py View on Github external
timeout=600.,))
    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
    # Ensure there are no scripts to be executed.
    out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
    root = html.fromstring(out['html'], parser=html.HTMLParser(),
                           base_url=final_url)
    try:
        head = root.xpath('./head')[0]
    except IndexError:
        head = html.Element('head')
        root.insert(0, head)
    if not head.xpath('./base/@href'):
        head.insert(0, html.Element('base', {'href': final_url}))
    if not head.xpath('./meta/@charset'):
        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
    out['html'] = html.tostring(root, encoding='utf-8',
                                doctype='')
    filename = re.sub(r'[^\w]+', '_', url) + '.html'
    with open(os.path.join(sites_dir, filename), 'w') as f:
        f.write(out['html'])
    return filename
github ahgl / ahgl-site / ahgl / tournaments / management / commands / scrape_ahgl.py View on Github external
def load_player(self, member_url, team, char_name=None):
        """ Loads player and team membership data, and adds as member to team. Return profile, membership """
        try:
            member_d = self.visit_url(member_url)
        except IOError:
            profile_name = " ".join((word.capitalize() for word in member_url.strip("/").split("/")[-1].split("-")))
            print("Page not found, constructing from {0} name and {1} charname".format(profile_name, char_name))
            # create profile and membership
            profile, created = Profile(name=profile_name, user=self.master_user), True
            profile.save()
            membership = TeamMembership(team=team, profile=profile, char_name=char_name, active=False)
            membership.save()
            return profile, membership

        if "Player not found in database" in tostring(member_d):
            print("Player not found...skipping", file=self.stdout)
            return
        info_ps = member_d.cssselect('.content-section-1 p')
        info_h3s = member_d.cssselect('.content-section-1 h3')
        profile_name = info_ps[1].text
        if char_name is None:
            char_name = info_ps[4].text
            if "." in char_name:
                char_name = char_name.split(".", 1)[0]
        if Profile.objects.filter(name=profile_name).count():
            profile, created = Profile.objects.get(name=profile_name), False
            membership, membership_created = TeamMembership.objects.get_or_create(team=team, profile=profile, defaults={'char_name': char_name})
            membership.char_name = char_name
        else:
            try:
                membership = TeamMembership.objects.get(team=team, char_name=char_name)
github kovidgoyal / calibre / src / calibre / ebooks / metadata / sources / overdrive.py View on Github external
'spanish':'spa'}.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print("ebook isbn is "+type('')(ebook_isbn[0]))
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
github M157q / linkedin2md / linkedin2md / linkedin2md.py View on Github external
def get_profile_page_html(linkedin_id):
    """Fetch a LinkedIn profile page and return its serialized HTML.

    A new dryscrape session rooted at ``https://www.linkedin.com/in/`` is
    opened for every attempt, ``linkedin_id`` is visited relative to that
    base URL, and the resulting document is serialized with
    ``lxml.html.tostring``.  The attempt is repeated, without any upper
    bound, until the serialized page is non-empty.

    :param linkedin_id: profile identifier appended to the base URL.
    :return: whatever ``lxml.html.tostring`` produced for the first
        non-empty page.
    """
    while True:
        browser = dryscrape.Session(base_url="https://www.linkedin.com/in/")
        browser.visit(linkedin_id)
        markup = lxml.html.tostring(browser.document())
        # Drop our reference to the session before deciding whether to retry,
        # so at most one session is referenced by this function at a time.
        del browser
        if markup:
            return markup
github letolab / airy / airy / core / decorators.py View on Github external
def set_meta_description(text):
            """Write ``text`` into the page's ``<meta name="description">`` tag.

            ``obj.HTML`` is parsed, the ``content`` attribute of the first
            matching meta element under ``<head>`` is overwritten, and the
            re-serialized document is stored back on ``obj.HTML``.  When the
            document has no such meta element nothing is modified, but
            ``obj.HTML`` is still replaced by the re-serialized document.
            """
            parsed = document_fromstring(obj.HTML)
            description_tags = parsed.findall('.//head/meta[@name="description"]')
            # An empty result list means there is no description tag to update;
            # that case is silently ignored.
            if description_tags:
                description_tags[0].attrib['content'] = text
            obj.HTML = tostring(parsed)
github zulip / zulip / zerver / lib / email_notifications.py View on Github external
container.drop_tree()

    # The previous block handles most inline images, but for messages
    # where the entire markdown input was just the URL of an image
    # (i.e. the entire body is a message_inline_image object), the
    # entire message body will be that image element; here, we need a
    # more drastic edit to the content.
    if fragment.get('class') == 'message_inline_image':
        content_template = '<p><a title="%s" href="%s">%s</a></p>'
        image_link = fragment.find('a').get('href')
        image_title = fragment.find('a').get('title')
        new_content = (content_template % (image_link, image_title, image_link))
        fragment = lxml.html.fromstring(new_content)

    fragment.make_links_absolute(base_url)
    content = lxml.html.tostring(fragment).decode("utf-8")

    return content
github kootenpv / requests_viewer / requests_viewer / web.py View on Github external
def make_parent_line(node, attach_head=False, question_contains=None):
    """Serialize ``node`` wrapped in re-created tags of each of its ancestors.

    Starting from the markup of ``node`` itself, the function walks up the
    parent chain and wraps the accumulated string in an opening and closing
    tag for every ancestor, copying the ancestor's tag name and attributes.
    Siblings of ``node`` and of its ancestors are not included.

    :param node: lxml element to serialize.
    :param attach_head: when true, the serialized ``<head>`` found below the
        ``html`` ancestor is prepended before that ancestor is wrapped around
        the result.
    :param question_contains: when not ``None``, the initial markup is taken
        from ``does_this_element_contain(question_contains, <serialized node>)``
        instead of the plain serialization.
    :return: the assembled markup string.
    """
    if question_contains is not None:
        # NOTE(review): the helper receives the raw (bytes) serialization, as
        # before; it is assumed to return text that can be embedded below --
        # confirm against its definition.
        newstr = does_this_element_contain(question_contains, lxml.html.tostring(node))
    else:
        # tostring() yields bytes by default; decode so the value can be
        # concatenated with and formatted into str, matching the head branch.
        newstr = lxml.html.tostring(node, encoding='utf8').decode('utf8')
    parent = node.getparent()
    while parent is not None:
        if attach_head and parent.tag == 'html':
            head = parent.find('.//head')
            # Documents without a <head> have nothing to attach.
            if head is not None:
                newstr = lxml.html.tostring(head, encoding='utf8').decode('utf8') + newstr
        attrs = " ".join('{}="{}"'.format(name, value) for name, value in parent.items())
        # Emit both the opening and the closing tag of the ancestor.
        newstr = '<{} {}>{}</{}>'.format(parent.tag, attrs, newstr, parent.tag)
        parent = parent.getparent()
    return newstr
github vim-awesome / vim-awesome / tools / scrape / vimorg.py View on Github external
def _get_inner_text(html_node):
    """Return the plain text contained in an HTML node.

    The node is serialized with lxml's ``text`` output method, which emits
    only text content: line breaks and other markup are dropped, anchor
    tags are reduced to their text, and HTML entities (non-breaking spaces,
    smart quotes, ...) come out as their unicode equivalents.  Text that
    trails the node's closing tag is excluded.
    """
    text_bytes = lxml.html.tostring(
        html_node,
        method='text',
        encoding='utf-8',
        with_tail=False,
    )
    return text_bytes.decode('utf-8')
</a>