How to use the lxml.html.document_fromstring function in lxml

To help you get started, we’ve selected a few lxml examples based on popular ways it is used in public projects.

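Before looking at the project examples, here is a minimal sketch of the function itself (the strings below are illustrative). Unlike lxml.html.fromstring, document_fromstring always builds a complete document, so the root of the returned tree is the <html> element even when the input is a bare fragment:

from lxml import html

# document_fromstring wraps bare fragments in <html>/<body>.
doc = html.document_fromstring(u"<p>Hello, lxml!</p>")
print(doc.tag)           # 'html'
print(doc.body[0].text)  # 'Hello, lxml!'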

github Kozea / WeasyPrint / weasy / tests.py
def foo():
    source = u"<p>今日は html5lib!"
#    doc = html5lib.parse(source, treebuilder="lxml")
    doc = html.document_fromstring(source)
#    print type(doc), repr(doc), str(doc), dir(doc)

    # document_fromstring always wraps its input in a full document,
    # so the root element of the parsed tree is <html>.
    assert doc.tag == 'html'
github erudit / eruditorg / eruditorg / apps / public / book / management / commands / import_books.py
def get_unicode_root(fd):
    content = fd.read()
    doc = UnicodeDammit(content, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(content, parser=parser)
    return root
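This pattern exists because document_fromstring has to guess the encoding when handed raw bytes; detecting it first with BeautifulSoup's UnicodeDammit and passing an explicit HTMLParser avoids mojibake. A hypothetical call (the file name is invented):

# Hypothetical usage of get_unicode_root; 'book.html' is an invented path.
with open('book.html', 'rb') as fd:
    root = get_unicode_root(fd)
print(root.tag)  # 'html'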
github jermnelson / Discover-Aristotle / aristotle / apps / discovery / templatetags / whitewhale_extras.py
def harvest_latest():
    """Function retrieves latest snapshot from live CC site, uses xpath to 
    save portions of the site to cache."""
    try:
        cc_home = urllib2.urlopen(CC_URL).read()
    except urllib2.HTTPError:
        logging.error("Unable to open CC_URL of %s" % CC_URL)
        return
    cc_tree = lxml.html.document_fromstring(cc_home)
    cc_tree.make_links_absolute(CC_URL)
    script_elements = cc_tree.xpath('//script')
    js_html = ''
    for script in script_elements:
        if script.text is None:
            js_html += lxml.html.tostring(script)
    cache.set('cc-scripts', js_html)
    header = cc_tree.xpath('//div[@id="header-wrapper"]')[0]
    cache.set('cc-header', lxml.html.tostring(header))
    footer = cc_tree.xpath('//footer[@id="footer"]')[0]
    # Bug with harvested footer from library website, removing
    # for IE
    footer_scripts = footer.xpath('script')
    for script in footer_scripts:
        footer.remove(script)
    cache.set('cc-footer', lxml.html.tostring(footer))
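The make_links_absolute(CC_URL) call early in that function is what keeps the cached header and footer usable outside the harvested page: it rewrites every relative href and src against the given base URL. A minimal sketch with an invented base URL, assuming lxml.html is imported as in the snippet above:

# Sketch: relative links are resolved against the base URL.
tree = lxml.html.document_fromstring('<a href="/about">About</a>')
tree.make_links_absolute('https://example.org/')
print(lxml.html.tostring(tree))
# b'<html><body><a href="https://example.org/about">About</a></body></html>'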
github manga-py / manga-py / providers / mangachan_me.py
def get_manga_name(url, get=None):
    global manga_name

    if len(manga_name):
        return manga_name.split('-', 1)[1]

    if re.search('/online/[^/]+', url):
        url = document_fromstring(get(url)).cssselect('.postload a.a-series-title.manga-title')[0].get('href')
    name = re.search(r'/[^/]+/(\d+)-([^/]+)\.html', url)

    if not name:
        raise UrlParseError()
    groups = name.groups()
    manga_name = '{}-{}'.format(groups[0], groups[1])

    return groups[1]
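For reference, the regular expression above splits a chapter URL into a numeric id and a name slug. A quick check against an invented URL:

import re

# '/manga/123-some-title.html' is an invented example URL.
m = re.search(r'/[^/]+/(\d+)-([^/]+)\.html', '/manga/123-some-title.html')
print(m.groups())  # ('123', 'some-title')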
github ysim / songtext / libsongtext / base.py
def page_html(self):
        return html.document_fromstring(self.response.text)
github manga-py / manga-py / providers / otakusmash_com.py
def get_images(main_content=None, volume=None, get=None, post=None):
    content = get(volume)
    parser = document_fromstring(content)
    select = parser.cssselect('.mid .pager select[name="page"]')[0]

    images = []
    _img = __images_helper(parser, volume)

    items = select.cssselect('option + option')

    if _img:
        images.append(_img)
    for i in items:
        page = document_fromstring(get('{}{}/'.format(volume, i.get('value'))))

        _img = __images_helper(page, volume)
        if _img:
            images.append(_img)

    return images
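The 'option + option' selector deliberately skips the first <option>, since the first page was already fetched as volume. A self-contained sketch of that adjacent-sibling trick, reusing the bare document_fromstring import from the snippet:

# 'option + option' matches every <option> preceded by another one,
# i.e. all options except the first.
page = document_fromstring('<select><option>1</option><option>2</option></select>')
print([o.text for o in page.cssselect('option + option')])  # ['2']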
github okfn / ckanext-pdeu / ckanext / pdeu / harvesters / data_publica.py
def import_stage(self, harvest_object):
        log.debug('In DataPublicaHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, 'Import')
            return False
        try:
            package_dict = {}
            extras_dict = {}

            package_dict['id'] = harvest_object.guid
            doc = html.document_fromstring(harvest_object.content)
            for field in doc.findall(".//div"):
                if 'field' not in field.get('class', ''): continue
                name = field.find("label").text.strip()

                if name == 'Title':
                    package_dict['title'] = field.find("div").xpath("string()").strip()

                if name == 'Categories':
                    extras_dict['categories'] = []
                    for elem in field.findall("div[@class='input']"):
                        if not elem.text: continue
                        extras_dict['categories'].append(elem.text.strip())

                if name == 'Software Licence':
                    #TODO: what to do with these?
                    a = field.find("div/a")
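The excerpt is truncated, but the scraping pattern is clear: each metadata field is a <div class="field"> containing a <label> and a value <div>. A self-contained sketch of the same idea with invented markup:

from lxml import html

snippet = '<div class="field"><label>Title</label><div>Open Data Report</div></div>'
doc = html.document_fromstring(snippet)
for field in doc.findall('.//div'):
    if 'field' not in field.get('class', ''):
        continue
    name = field.find('label').text.strip()
    value = field.find('div').xpath('string()').strip()
    print(name, '->', value)  # Title -> Open Data Report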
github esetera / Booki / lib / booki / xhtml_utils.py
def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        self.tree = lxml.html.document_fromstring(html)
github scrapinghub / webpager / webpager / features.py
def clean_html(cls, html, encoding=None):
        parser = lxml.html.HTMLParser(encoding=encoding)

        if isinstance(html, unicode) and encoding is not None:
            html = html.encode(encoding)

        html = lxml.html.document_fromstring(html, parser=parser)
        return _cleaner.clean_html(html)
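The _cleaner object is not shown, but it is presumably an lxml.html.clean.Cleaner instance (in recent lxml releases that module ships separately as the lxml_html_clean package). A minimal sketch of what such a cleaner does; the option choices here are an assumption, not the real _cleaner's configuration:

from lxml.html.clean import Cleaner

# Assumed configuration; the real _cleaner's options are not shown above.
cleaner = Cleaner(scripts=True, javascript=True, page_structure=False)
dirty = '<html><body><script>alert(1)</script><p>kept</p></body></html>'
print(cleaner.clean_html(dirty))  # the <script> is removed, the <p> survives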
github manga-py / manga-py / providers / bato_to.py
def get_volumes(content=None, url=None, get=None, post=None):
    items = document_fromstring(content).cssselect('.chapters_list a[href*="/reader#"]')
    return [i.get('href') for i in items]
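The a[href*="/reader#"] selector keeps only links whose href contains the substring "/reader#". A quick check with invented markup:

doc = document_fromstring(
    '<div class="chapters_list"><a href="/reader#c1">ch.1</a>'
    '<a href="/about">other</a></div>')
print([a.get('href') for a in doc.cssselect('.chapters_list a[href*="/reader#"]')])
# ['/reader#c1']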