How to use the lxml.etree.HTML function in lxml

To help you get started, we’ve selected a few lxml examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github erudit / eruditorg / tests / functional / apps / userspace / journal / editor / test_views.py View on Github external
def test_cannot_upload_while_adding(self):
    """Verify the issue-creation page renders without the upload widget.

    The IssueSubmission must be saved before any upload so that file
    chunks can be attached to it; the add form therefore must not show
    the submissions widget.
    """
    owner = UserFactory()
    owned_journal = JournalFactory(members=[owner])
    AuthorizationFactory.create_can_manage_issue_subscriptions(owner, owned_journal)

    http_client = Client(logged_user=owner)
    add_url = reverse(
        'userspace:journal:editor:add',
        kwargs={'journal_pk': owned_journal.pk},
    )
    page = http_client.get(add_url)
    document = etree.HTML(page.content)
    # No '#id_submissions' element may appear on the creation form.
    assert len(document.cssselect('#id_submissions')) == 0
github erudit / eruditorg / erudit / editor / tests.py View on Github external
def test_cannot_upload_while_adding(self):
    """Verify the creation page does not render the file upload widget.

    The IssueSubmission has to exist before uploads happen so that file
    chunks can be associated with it.
    """
    self.client.login(username='david', password='top_secret')
    add_page = self.client.get(reverse('editor:add'))
    document = etree.HTML(add_page.content)
    upload_inputs = document.cssselect('#id_submission_file')
    self.assertFalse(
        upload_inputs,
        "The rendered template should not contain an id_submission_file input"  # noqa
    )
github wackou / smewt / smewt / media / subtitle / subtitle_tvsubtitles_provider.py View on Github external
    @cachedmethod
    def getLikelySeriesUrl(self, name):
        # Search tvsubtitles.net for *name* and collect candidate series
        # pages as {'title', 'url'} dicts.
        # NOTE(review): snippet is truncated below (`if not matches:` has
        # no visible body), so the tail of this method is unknown here.
        data = urlencode({ 'q': name })
        # POST the query to /search.php and parse the result page.
        html = etree.HTML(urlopen(self.baseUrl + '/search.php', data).read())
        # Result entries appear to be <a> links inside style-less <div>s
        # — presumably the site's search-result rows; verify against the
        # live markup.
        matches = [ s.find('a') for s in html.findall(".//div[@style='']") ]

        # add baseUrl and remove year information
        result = []
        for match in matches:
            seriesID = int(match.get('href').split('-')[1].split('.')[0]) # remove potential season number
            seriesUrl = self.baseUrl + '/tvshow-%d.html' % seriesID
            title = match.text
            # Strip a trailing " (year)" suffix if present.
            # NOTE(review): bare `except: pass` also hides real errors
            # (e.g. title is None); and when '(' is absent, find() returns
            # -1 so idx == -2 silently truncates the last two characters.
            try:
                idx = title.find('(') - 1
                title = title[:idx]
            except: pass

            result.append({ 'title': title, 'url': seriesUrl })

        if not matches:
github qazbnm456 / VWGen / core / attack / mod_exec.py View on Github external
def generate_payloads(self, html_code, parent=None):
        # Parse *html_code* line-by-line, locate injection points found by
        # self.study(), and hand each matched element to generateHandler().
        # NOTE(review): snippet is truncated (the trailing "# </a>" comment
        # suggests at least one more elif branch follows).
        e = []  # injection-point entries filled in by self.study()
        o = []  # original lines, mutated by generateHandler()
        l = []  # per-line copies used for per-line re-parsing below

        for index, line in enumerate(html_code.splitlines(), 1):
            o.append(line)
            # NOTE(review): "{1}" formats only `line`; `index` is unused,
            # so this is equivalent to l.append(line) — possibly a leftover
            # from a "{0} {1}" format. Confirm intent before changing.
            l.append("{1}".format(index, line))

        # Full-document tree used both for study() and as the mutation target.
        tree = etree.HTML(decode_html("\n".join(l))).getroottree()
        self.study(tree, entries=e, lines=l, parent=parent)

        for elem in e:
            # <a href="inject_point"></a>
            if elem['type'] == "attrval":
                # Re-parse just the offending line and look for any attribute
                # *value* matching the identifier (case-insensitive, via the
                # EXSLT regexp extension).
                found_node = etree.HTML(l[int(elem['lineno']) - 1]).xpath("//*[@*[re:test(., '{0}', 'i')]]".format(
                    elem['identifier']), namespaces={'re': "http://exslt.org/regular-expressions"})
                # Only act on unambiguous (single) matches.
                if len(found_node) == 1:
                    self.generateHandler(tree_node=tree, o=o, elem=elem)
            # <a>
            elif elem['type'] == "attrname":
                # Same as above but matching on the attribute *name*.
                found_node = etree.HTML(l[int(elem['lineno']) - 1]).xpath("//*[@*[re:test(name(.), '{0}', 'i')]]".format(
                    elem['identifier']), namespaces={'re': "http://exslt.org/regular-expressions"})
                if len(found_node) == 1:
                    self.generateHandler(tree_node=tree, o=o, elem=elem)
            # </a>
github ponyorm / pony / pony / xslt_.py View on Github external
def html2xml(x, encoding='ascii'):
    """Coerce *x* into an lxml HTML element tree.

    Objects that already expose ``write_c14n`` (lxml trees) pass through
    untouched; anything else is converted to unicode text — decoding byte
    strings with *encoding* — and parsed with ``etree.HTML``.

    Note: Python 2 only (relies on ``basestring`` and ``unicode``).
    """
    # Already an lxml tree: nothing to do.
    if hasattr(x, 'write_c14n'):
        return x
    if not isinstance(x, basestring):
        # Stringify arbitrary objects, preferring their unicode form.
        x = unicode(x) if hasattr(x, '__unicode__') else str(x)
    if isinstance(x, str):
        # Decode byte strings before handing them to the parser.
        x = unicode(x, encoding)
    return etree.HTML(x)
github RobSis / astrobot / astrobot.py View on Github external
pass

        # Resolve a direct image URL from a supported page URL.
        # NOTE(review): this is the interior of a larger method — its `def`
        # (and the stray `pass` above, presumably closing an earlier
        # try/except) lies outside this snippet.
        if "apod.nasa.gov" in url.netloc:
            try:
                file = urllib2.urlopen(url.geturl(), context=self.context)
                tree = etree.HTML(file.read())
                # First <img src> on an APOD page is the picture itself —
                # presumably; confirm against the live markup.
                directUrl = tree.xpath('//img/@src')
                if len(directUrl):
                    return "http://apod.nasa.gov/apod/" + directUrl[0]
            except:
                # NOTE(review): bare except silently swallows all errors,
                # including KeyboardInterrupt — deliberate best-effort?
                pass

        if "wikipedia.org" in url.netloc and "File:" in url.path:
            try:
                file = urllib2.urlopen(url.geturl(), context=self.context)
                tree = etree.HTML(file.read())
                # The "fullMedia" div links to the original file; its href
                # is protocol-relative, hence the "http:" prefix below.
                directUrl = tree.xpath('//div[@class="fullMedia"]/a/@href')[0]
                if len(directUrl):
                    return "http:" + directUrl
            except:
                pass

        # No supported host matched (or extraction failed).
        return None
github JustForFunnnn / webspider / app / tasks / company.py View on Github external
def requests_company_detail_data(company_id):
    """Fetch the company detail page and extract its key fields.

    Requests the detail page for *company_id* (after a polite crawl delay,
    with randomized headers and cookies), parses the HTML, and returns the
    formatted tag data.

    Raises:
        RequestsError: when the underlying HTTP request fails.
    """
    request_headers = generate_http_header()
    crawler_sleep()  # throttle requests to avoid hammering the site
    detail_url = constants.COMPANY_DETAIL_URL.format(company_id=company_id)
    try:
        page = requests.get(
            url=detail_url,
            headers=request_headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)

    document = etree.HTML(page.text)
    advantage = document.xpath('//div[@id="tags_container"]//li/text()')
    size = document.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address = document.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce = document.xpath('//span[@class="company_content"]//text()')

    return format_tag(advantage, address, size, introduce, company_id)
github offensive-security / exploitdb / platforms / linux / webapps / 41223.py View on Github external
def get_api_url(wordpress_url):
    """Discover the WP REST API root advertised by a WordPress site.

    Reads the ``<link rel="https://api.w.org/">`` element from the site's
    front page and returns its href.
    """
    page = urllib2.urlopen(wordpress_url)
    document = etree.HTML(page.read())
    api_url = document.xpath('//link[@rel="https://api.w.org/"]/@href')[0]

    # check if we have permalinks
    # A '?rest_route=' style URL means pretty permalinks are disabled.
    if 'rest_route' in api_url:
        print(' ! Warning, looks like permalinks are not enabled. This might not work!')

    return api_url
github leovan / SciHubEVA / scihub_api.py View on Github external
def get_captcha_info(self, pdf_captcha_response):
    """Get captcha information from a PDF captcha response page.

    Args:
        pdf_captcha_response: HTTP response whose body is the captcha page

    Returns:
        Tuple of (captcha_id, captcha_img_url); both are None when the
        page contains no captcha markup.
    """
    document = etree.HTML(pdf_captcha_response.content)
    captcha_imgs = document.xpath('//img[@id="captcha"]')
    id_inputs = document.xpath('//input[@name="id"]')

    if not (captcha_imgs and id_inputs):
        return None, None

    captcha_id = id_inputs[0].attrib['value']
    img_src = captcha_imgs[0].attrib['src']

    if img_src.startswith('http'):
        return captcha_id, img_src

    # Relative src: rebuild an absolute URL from the response's own URL.
    scheme, netloc, *_ = urlparse(pdf_captcha_response.url, scheme='http')
    return captcha_id, scheme + '://' + netloc + img_src
github howie6879 / talospider / talospider / item.py View on Github external
def _get_html(cls, html, url, html_etree, params, **kwargs):
    """Resolve an lxml HTML tree from raw markup, a URL, or a ready tree.

    Source priority: *html* string -> fetch *url* -> pass *html_etree*
    through unchanged.

    Raises:
        ValueError: when none of the three sources is provided.
    """
    if html:
        return etree.HTML(html)
    if url:
        # Supply a random User-Agent when the caller gave none (or a
        # falsy headers value — intentionally replaced, matching get()).
        if not kwargs.get('headers'):
            kwargs['headers'] = {
                "User-Agent": get_random_user_agent()
            }
        response = requests.get(url, params, **kwargs)
        response.raise_for_status()
        raw = response.content
        # Detect the encoding from the bytes rather than trusting headers.
        detected = cchardet.detect(raw)
        return etree.HTML(raw.decode(detected['encoding']))
    if html_etree is not None:
        return html_etree
    raise ValueError("html(url or html_etree) is expected")