How to use the lxml.etree.XPath function in lxml

To help you get started, we’ve selected a few lxml examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Marduke / CalimeplPacz / plugins / pitaval / __init__.py View on Github external
identifiers – A dictionary of other identifiers, most commonly {‘isbn’:‘1234...’}
            timeout – Timeout in seconds, no network request should hang for longer than timeout.
        Returns:
            None if no errors occurred, otherwise a unicode representation of the error suitable for showing to the user
        '''

        self.log = Log(self.name, log)

        found = []
        xml = None
        detail_ident = None

        #test previous found first
        ident = identifiers.get(self.name, None)

        XPath = partial(etree.XPath, namespaces=self.NAMESPACES)
        detail_test = XPath('//x:div[@id="detail"]')
        entry = XPath('//x:tr[@class="suda" or @class="licha"]')

        query = self.create_query(title=title, authors=authors,
                identifiers=identifiers)
        if not query:
            self.log('Insufficient metadata to construct query')
            return

        br = self.browser
        try:
            self.log('download book page search %s'%query)
            raw = br.open(query, timeout=timeout).read().strip()
            try:
                parser = etree.XMLParser(recover=True)
                clean = clean_ascii_chars(raw)
github quiris11 / ExtractCoverThumbs / epubQTools / lib / epubqcheck.py View on Github external
if check_hyphs:
            if not _ufound and _alltext.find(u'\u00AD') != -1:
                print(_file_dec + ': U+00AD hyphenate marks found.')
                _ufound = True
            if not _unbfound and _alltext.find(u'\u00A0') != -1:
                print(_file_dec + ': U+00A0 non-breaking space found.')
                _unbfound = True
        _links = etree.XPath('//xhtml:link', namespaces=XHTMLNS)(_xhtmlsoup)
        for _link in _links:
            if not _linkfound and (_link.get('type') is None):
                _linkfound = True
                print(_file_dec + ': At least one xhtml file has link tag '
                      'without type attribute defined')

    #Check dtb:uid - should be identical to dc:identifier
    ncxfile = etree.XPath('//opf:item[@media-type="application/x-dtbncx+xml"]',
                          namespaces=OPFNS)(opftree)[0].get('href')
    ncxtree = etree.fromstring(_epubfile.read(_folder + ncxfile))
    uniqid = etree.XPath('//opf:package',
                         namespaces=OPFNS)(opftree)[0].get('unique-identifier')
    if uniqid is not None:
        try:
            dc_identifier = etree.XPath('//dc:identifier[@id="' + uniqid +
                                        '"]/text()',
                                        namespaces=DCNS)(opftree)[0]
        except:
            dc_identifier = ''
            print(_file_dec + ': dc:identifier with unique-id not found')
    else:
        print(_file_dec + ': no unique-identifier found')
    try:
        metadtd = etree.XPath('//ncx:meta[@name="dtb:uid"]',
github mrgaaron / LinkedIn-Client-Library / liclient / parsers / lixml.py View on Github external
def __init__(self, content):
        """Keep the given tree, precompile the field XPaths and build the results.

        ``content`` is stored as ``self.tree``. ``self.xpath_collection`` maps
        each field name to a compiled ``etree.XPath``. ``self.results`` holds
        whatever ``self.__build_data`` returns for the stored tree.
        """
        self.tree = content

        # (field name, XPath expression) pairs; the two date fields select
        # the nested ``year`` child instead of the date element itself.
        field_paths = (
            ('id', 'id'),
            ('school-name', 'school-name'),
            ('field-of-study', 'field-of-study'),
            ('start-date', 'start-date/year'),
            ('end-date', 'end-date/year'),
            ('degree', 'degree'),
            ('activities', 'activities'),
        )
        self.xpath_collection = {
            field: etree.XPath(path) for field, path in field_paths
        }

        self.results = self.__build_data(self.tree)
github kovidgoyal / calibre / src / calibre / ebooks / oeb / parse_utils.py View on Github external
def XPath(expr):
    """Compile ``expr`` as an ``etree.XPath`` with prefix ``h`` bound to ``XHTML_NS``."""
    prefix_map = {'h': XHTML_NS}
    compiled = etree.XPath(expr, namespaces=prefix_map)
    return compiled
github Marduke / CalimeplPacz / plugins / kdb / worker.py View on Github external
def __init__(self, ident, result_queue, browser, log, relevance, plugin, timeout=20):
        """Initialise the worker as a daemon thread and store its inputs.

        ``ident`` is kept as given and also converted with ``int()`` into
        ``self.number``, which is used in the logger name. ``cover_url`` and
        ``isbn`` start out as ``None``.

        NOTE(review): ``threading.Thread`` exposes ``ident`` as a read-only
        property; confirm the enclosing class allows assigning ``self.ident``.
        """
        Thread.__init__(self)
        self.daemon = True

        self.ident = ident
        self.result_queue = result_queue
        # Store the result of clone_browser(), not the browser object passed in.
        self.browser = browser.clone_browser()
        self.relevance = relevance
        self.plugin = plugin
        self.timeout = timeout

        self.cover_url = None
        self.isbn = None

        # XPath factory with the plugin's namespace map already bound.
        self.XPath = partial(etree.XPath, namespaces=plugin.NAMESPACES)

        self.number = int(ident)
        self.log = Log("worker %i" % self.number, log)
github laurentb / weboob / tools / check_xpath.py View on Github external
def check_xpath(self, s, lineno):
        """Validate the XPath string ``s`` and optionally warn about a missing ".".

        Raises ``Error(self.file, lineno, exc)`` when lxml rejects ``s`` with
        ``XPathSyntaxError``. When ``self.warnings`` is truthy, a hint is
        printed if ``s`` (ignoring leading "(") does not start with "." while
        the last entry of ``self.element_context`` is ``'ItemElement'`` and the
        one before it is ``'TableElement'`` or ``'ListElement'``.
        """
        try:
            lxml.etree.XPath(s)
        except lxml.etree.XPathSyntaxError as exc:
            raise Error(self.file, lineno, exc)

        if not self.warnings:
            return
        # Expressions that already start with "." need no hint.
        if s.lstrip('(').startswith('.'):
            return

        context = self.element_context
        if len(context) < 2:
            return

        in_item = context[-1] == 'ItemElement'
        if in_item and context[-2] in ('TableElement', 'ListElement'):
            print('%s:%s: probable missing "." at start of XPath' % (self.file, lineno))
github quiris11 / ExtractCoverThumbs / epubQTools / epubQTools.py View on Github external
def fix_styles(source_file):
    """Set ``type="text/css"`` on every XHTML ``link`` element that has no type.

    Args:
        source_file: Document object handed to the compiled ``//xhtml:link``
            XPath (evaluated with the ``XHTMLNS`` namespace map).

    Returns:
        The same ``source_file`` object; matching ``link`` elements are
        modified in place.
    """
    try:
        links = etree.XPath(
            '//xhtml:link',
            namespaces=XHTMLNS
        )(source_file)
    except Exception:
        # Best-effort lookup: report the failure and continue with nothing to
        # fix. Previously ``links`` stayed unbound on this path, so the loop
        # below raised NameError; the bare ``except`` also swallowed
        # KeyboardInterrupt/SystemExit.
        print('No links found...')
        links = []
    for link in links:
        if link.get('type') is None:
            link.set('type', 'text/css')
    return source_file
github quiris11 / ExtractCoverThumbs / epubQTools / epubQTools.py View on Github external
# set dc:language to my language
    for lang in soup.xpath("//dc:language", namespaces=DCNS):
        if lang.text != _my_language:
            lang.text = _my_language

    # add missing dc:language
    if len(soup.xpath("//dc:language", namespaces=DCNS)) == 0:
        for metadata in soup.xpath("//opf:metadata", namespaces=OPFNS):
            newlang = etree.Element(
                '{http://purl.org/dc/elements/1.1/}language'
            )
            newlang.text = _my_language
            metadata.insert(0, newlang)

    # add missing meta cover and cover reference guide element
    metacovers = etree.XPath('//opf:meta[@name="cover"]',
                             namespaces=OPFNS)(soup)
    refcovers = etree.XPath('//opf:reference[@type="cover"]',
                            namespaces=OPFNS)(soup)
    if len(metacovers) == 1 and len(refcovers) == 0:
        # set missing cover reference guide element
        itemcovers = etree.XPath(
            '//opf:item[@id="' + metacovers[0].get('content') + '"]',
            namespaces=OPFNS
        )(soup)
        if verbose:
            print('Defining cover guide element...')
        itemcoverhref = os.path.basename(itemcovers[0].get('href'))
        soup = set_cover_guide_ref(
            xhtml_files, itemcoverhref, xhtml_file_paths, soup
        )
github quiris11 / ExtractCoverThumbs / epubQTools / lib / epubqcheck.py View on Github external
namespaces=OPFNS)(opftree)[0].get('href')
    ncxtree = etree.fromstring(_epubfile.read(_folder + ncxfile))
    uniqid = etree.XPath('//opf:package',
                         namespaces=OPFNS)(opftree)[0].get('unique-identifier')
    if uniqid is not None:
        try:
            dc_identifier = etree.XPath('//dc:identifier[@id="' + uniqid +
                                        '"]/text()',
                                        namespaces=DCNS)(opftree)[0]
        except:
            dc_identifier = ''
            print(_file_dec + ': dc:identifier with unique-id not found')
    else:
        print(_file_dec + ': no unique-identifier found')
    try:
        metadtd = etree.XPath('//ncx:meta[@name="dtb:uid"]',
                              namespaces=NCXNS)(ncxtree)[0]
        if metadtd.get('content') != dc_identifier:
            print(_file_dec + ': dtd:uid and dc:identifier mismatched')
    except IndexError:
        print(_file_dec + ': dtd:uid not properly defined')

    for meta in opftree.xpath("//opf:meta[starts-with(@name, 'calibre')]",
                              namespaces=OPFNS):
        print(_file_dec + ': calibre staff found')
        break
    for dcid in opftree.xpath(
        "//dc:identifier[@opf:scheme='calibre']",
        namespaces={'dc': 'http://purl.org/dc/elements/1.1/',
                    'opf': 'http://www.idpf.org/2007/opf'}
    ):
        print(_file_dec + ': calibre staff found')
github Marduke / CalimeplPacz / plugins / pitaval / worker.py View on Github external
def __init__(self, ident, result_queue, browser, log, relevance, plugin, xml, timeout=20):
        """Initialise the worker as a daemon thread and store its inputs.

        ``xml`` is stored unchanged as ``self.xml``. ``cover_url`` and
        ``isbn`` start out as ``None``. The logger name embeds ``ident``.

        NOTE(review): ``threading.Thread`` exposes ``ident`` as a read-only
        property; confirm the enclosing class allows assigning ``self.ident``.
        """
        Thread.__init__(self)
        self.daemon = True

        self.ident = ident
        self.result_queue = result_queue
        # Store the result of clone_browser(), not the browser object passed in.
        self.browser = browser.clone_browser()
        self.relevance = relevance
        self.plugin = plugin
        self.timeout = timeout

        self.cover_url = None
        self.isbn = None

        # XPath factory with the plugin's namespace map already bound.
        self.XPath = partial(etree.XPath, namespaces=plugin.NAMESPACES)

        self.xml = xml
        self.log = Log("worker %s" % ident, log)