How to use the lxml.html.fromstring function in lxml

To help you get started, we’ve selected a few lxml examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github trac-hacks / trac-github / runtests.py View on Github external
# Extract the state from the redirect
                redirect_url = urlparse.urlparse(response.headers['Location'])
                params = urlparse.parse_qs(redirect_url.query, keep_blank_values=True)
                state = params['state'][0]  # this is a random value
                response = session.get(
                    URL + '/github/oauth',
                    params={
                        'code': '01234567890123456789',
                        'state': state
                    },
                    allow_redirects=False)
                self.assertEqual(response.status_code, 302)

                response = session.get(URL + '/prefs')
                self.assertEqual(response.status_code, 200)
                tree = html.fromstring(response.content)
                return (''.join(tree.xpath('//div[@id="warning"]/text()')).strip(),
                        tree.xpath('//input[@id="email"]/@value'))
            finally:
                # disable callback again
                updateMockData(self.mockdata, postcallback="")
github th3-z / kf2-magicked-admin / magicked_admin / server / chat / chat.py View on Github external
self.chat_request_url,
                    self.chat_request_payload,
                    timeout=2
                )
            except requests.exceptions.RequestException:
                logger.debug("Couldn't retrieve chat (RequestException) ({})"
                             .format(self.server.name))
                time.sleep(self.time_interval + 3)
                continue

            if response.text:
                # trailing new line ends up in list without the strip
                messages_html = response.text.strip().split("\r\n\r\n")

                for message_html in messages_html:
                    message_tree = html.fromstring(message_html)
                    # xpath returns a list but theres only ever one of each because i split earlier
                    username_arr = message_tree.xpath('//span[starts-with(@class,\'username\')]/text()')
                    message = message_tree.xpath('//span[@class="message"]/text()')[0]
                    if len(username_arr) < 1:
                        # toss username arg to cd parser?
                        # Add in a test here to parse CD
                        logger.debug("Message without username '{}' ({})"
                                     .format(message, self.server.name))
                        continue
                    username = username_arr[0]

                    user_type_arr = message_tree.xpath('//span[starts-with(@class,\'username\')]/@class')
                    if len(user_type_arr) < 1:
                        logger.debug("Message without user type '{}' ({})"
                                     .format(message, self.server.name))
                        continue
github laurentb / weboob / modules / sachsen / pages.py View on Github external
def parse(self, el):
                """Extract gauge data from an escaped HTML popup and fill self.env.

                *el*.text holds JS-escaped popup markup; it is unescaped,
                parsed with lxml, and mined for name, object, station id,
                timestamp, water level and flow values.
                """
                # The popup HTML lives inside a JS string: strip the escaping first.
                raw = self.extract.match(el.text).group("html")
                for escaped, plain in (('\\"', '"'), ('\\n', ''), ('\\/', '/')):
                    raw = raw.replace(escaped, plain)
                parsed = lxml.html.fromstring(raw)

                self.env['name'] = CleanText('.//span[@class="popUpTitleBold"]')(parsed)
                self.env['object'] = CleanText('.//span[@class="popUpTitleNormal"]')(parsed).strip(' /')
                # Station id is embedded in the diagram image filename, e.g. "..._<id>_...".
                img_src = Attr('.//div[@class="popUpMsDiagramm"]/img', 'src')(parsed)
                self.env['id'] = img_src.split('_')[1]

                for row in parsed.xpath('.//tr'):
                    cells = row.xpath('.//td')
                    if len(cells) == 1 and "Datum" in cells[0].text:
                        # Tokens 1 and 2 after "Datum" are the date and time parts.
                        parts = cells[0].text.split()[1:3]
                        self.env['datetime'] = "%s %s" % (parts[0], parts[1])
                    elif len(cells) == 2:
                        label = cells[0].text
                        if "Wasserstand" in label:
                            self.env['levelvalue'] = cells[1].text.split()[0]
                        elif "Durchfluss" in label:
                            self.env['flowvalue'] = cells[1].text.split()[0]
github eskerda / pybikes / pybikes / smartbike.py View on Github external
//td[span[text() = "%s"]]/
                following-sibling::td/text()
        """

        stats_rules = {
            'std': 'Bicycles',
            'ebikes': 'Electric bicycles',
            'kids_bikes': 'Bicycles for kids'
        }

        stations = []

        for station_data in stations_data:
            latitude, longitude, name, mess = station_data
            # ??
            html_mess = html.fromstring(mess.encode('utf-8').decode('unicode_escape'))
            stats = {}
            extra = {}

            for k, rule in stats_rules.items():
                stats[k] = list(map(int, html_mess.xpath(stats_query % rule)))

            bikes = 0
            free = None

            if stats.get('std'):
                bikes += stats['std'][0]
                # std free already accounts for all slots
                free = stats['std'][1]

            if stats.get('ebikes'):
                extra['has_ebikes'] = True
github alephdata / memorious / funes / modules / retain.py View on Github external
def _drop_paths(context, res):
    """Remove configured element paths from an HTML crawl result.

    :param context: crawl context providing ``params`` (dict-like) and ``log``.
    :param res: fetched result exposing ``get()`` (local file path),
        ``content_type``, ``url`` and ``foreign_id``; may be ``None``.
    :return: a new ``HTTPResult`` with the paths removed, *res* unchanged when
        it is not HTML (or ``None``), or ``None`` when the ``check_path``
        sanity test fails.
    """
    # Bug fix: the original opened and parsed res.get() BEFORE checking that
    # res exists and is HTML, so it crashed on res=None and parsed non-HTML
    # bodies for nothing. Guard first, read after.
    if res is None or 'html' not in res.content_type:
        return res

    with open(res.get(), 'r') as fh:
        doc = html.fromstring(fh.read())

    # Optional sanity check: the document must contain this path at all.
    check_path = context.params.get('check_path')
    if check_path is not None and doc.find(check_path) is None:
        context.log.info("[Failed XML path check]: %r", res.url)
        return None

    for path in context.params.get('remove_paths', []):
        for el in doc.findall(path):
            el.drop_tree()

    foreign_id = res.foreign_id
    if foreign_id is None:
        foreign_id = res.url
    foreign_id = foreign_id + '-paths-removed'
    content = html.tostring(doc)
    # Bug fix: the rebuilt result was previously assigned and then discarded
    # (dead store) so every path returned None; return it to the caller.
    return HTTPResult.from_res(res, content=content, foreign_id=foreign_id)
github alvarob96 / investpy / investpy / stocks.py View on Github external
url = "https://www.investing.com/equities/" + tag

    head = {
        "User-Agent": get_random(),
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "text/html",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    req = requests.get(url, headers=head)

    if req.status_code != 200:
        raise ConnectionError("ERR#0015: error " + str(req.status_code) + ", try again later.")

    root_ = fromstring(req.text)
    path_ = root_.xpath("//div[contains(@class, 'overviewDataTable')]/div")

    result = pd.DataFrame(columns=['Stock Symbol', 'Prev. Close', 'Todays Range', 'Revenue', 'Open', '52 wk Range',
                                   'EPS', 'Volume', 'Market Cap', 'Dividend (Yield)', 'Average Vol. (3m)', 'P/E Ratio',
                                   'Beta', '1-Year Change', 'Shares Outstanding', 'Next Earnings Date'])
    result.at[0, 'Stock Symbol'] = stock

    if path_:
        for elements_ in path_:
            element = elements_.xpath(".//span[@class='float_lang_base_1']")[0]
            title_ = element.text_content()
            if title_ == "Day's Range":
                title_ = 'Todays Range'
            if title_ in result.columns.tolist():
                try:
                    result.at[0, title_] = float(element.getnext().text_content().replace(',', ''))
github ozzieperez / packtpub-library-downloader / downloader.py View on Github external
account_page = session.get("https://www.packtpub.com/account", verify=True, headers=headers)
    accountpage_tree = html.fromstring(account_page.content)

    # login successful?
    if "Register" in accountpage_tree.xpath("//title/text()")[0]: # redirects to the 'Register' page if login fails
        print("Invalid login.")

    # we're in, start downloading
    else:
        print("Logged in successfully!")

        if book_assets:

            # get the list of books
            books_page = session.get("https://www.packtpub.com/account/my-ebooks", verify=True, headers=headers)
            books_tree = html.fromstring(books_page.content)
            book_nodes = books_tree.xpath("//div[@id='product-account-list']/div[contains(@class,'product-line unseen')]")

            print('###########################################################################')
            print("FOUND {0} BOOKS: STARTING DOWNLOADS".format(len(book_nodes)))
            print('###########################################################################')

            # loop through the books
            for book in book_nodes:

                # download the book
                books_directory = os.path.join(root_directory, "books")
                download_book(book, books_directory, book_assets, session, headers)

        if video_assets:

            # get the list of videos
github h3llrais3r / Auto-Subliminal / lib / lxml / html / clean.py View on Github external
def autolink_html(html, *args, **kw):
    """Auto-link URLs in *html*, returning the result in the input's own type.

    Accepts either markup text (parsed with ``fromstring``) or an already
    parsed document (deep-copied so the caller's tree is left untouched);
    extra arguments are forwarded to :func:`autolink`.
    """
    original_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(original_type, doc)
github RTXteam / RTX / code / reasoningtool / kg-construction / QueryMiRBase.py View on Github external
def convert_mirbase_id_to_mir_gene_symbol(mirbase_id):
        """Look up the HGNC gene symbol for a miRBase accession.

        Fetches the miRBase entry page for *mirbase_id* and scrapes the first
        anchor whose text contains ``HGNC:``.

        :param mirbase_id: non-empty miRBase accession string.
        :return: the HGNC gene symbol (``str``), or ``None`` when the entry
            page has no HGNC link.
        """
        # NOTE(review): assert is stripped under `python -O`; kept for
        # interface compatibility (callers may rely on AssertionError).
        assert mirbase_id != ''
        res = QueryMiRBase.send_query_get('mirna_entry.pl', 'acc=' + mirbase_id)
        res_tree = lxml.html.fromstring(res.content)
        # Removed the original's unused `ret_ids = set()` dead local.
        hgnc_links = res_tree.xpath("//a[contains(text(), 'HGNC:')]")
        if hgnc_links:
            return hgnc_links[0].text.replace('HGNC:', '')
        return None