How to use the hepcrawl.utils.get_node function in hepcrawl

To help you get started, we've selected a few get_node examples based on popular ways the function is used in public projects.

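Before working through the examples, it helps to know what get_node does. Judging from the calls below, it wraps a string of XML in a scrapy Selector and registers any XML namespaces you hand it, so namespace-prefixed XPath queries work against the result. Here is a minimal sketch of that behavior (an illustrative reimplementation, not necessarily hepcrawl's exact source):

from scrapy.selector import Selector

def get_node(text, namespaces=None):
    """Return an XML Selector over ``text`` with ``namespaces`` registered."""
    node = Selector(text=text, type="xml")
    for prefix, uri in namespaces or []:
        # register_namespace makes e.g. the "slim" prefix usable in XPath
        node.register_namespace(prefix, uri)
    return node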

From inspirehep/hepcrawl: tests/unit/test_utils.py
"""
    namespaces = [
        ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/"),
        ("slim", "http://www.loc.gov/MARC21/slim"),
    ]
    node = get_node(text=body, namespaces=namespaces)
    record = node.xpath("//slim:record/slim:datafield/text()").extract_first()

    assert node
    assert record == "This is the record."
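
The body variable comes from earlier in the test and is not shown here. An XML payload along these lines would satisfy both assertions, given the namespaces registered above (illustrative, not the original fixture):

body = """
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <record xmlns="http://www.loc.gov/MARC21/slim">
    <datafield>This is the record.</datafield>
  </record>
</OAI-PMH>
"""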

From inspirehep/hepcrawl: hepcrawl/spiders/edp_spider.py
def build_item_rich(self, response):
        """Build the final HEPRecord with "rich" format XML."""
        node = get_node(response.meta["record"])
        article_type = response.meta.get("article_type")
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        record.add_dois(dois_values=response.meta.get("dois"))
        record.add_xpath('abstract', './/Abstract')
        record.add_xpath('title', './/ArticleTitle/Title')
        record.add_xpath('subtitle', './/ArticleTitle/Subtitle')
        record.add_value('authors', self._get_authors_rich(node))
        record.add_xpath('free_keywords', './/Subject/Keyword/text()')

        record.add_value('journal_title', response.meta['journal_title'])
        record.add_xpath('journal_issue', './/Issue/text()')
        record.add_xpath('journal_volume', './/Volume/text()')
        fpage = node.xpath('.//FirstPage/text()').extract_first()
        lpage = node.xpath('.//LastPage/text()').extract_first()
        record.add_value('journal_fpage', fpage)
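
The snippet above is truncated, but the overall pattern is clear: build a selector over one record's XML with get_node, wrap it in a HEPLoader, and collect fields with add_xpath and add_value. Condensed into a hypothetical minimum (load_item and the literal values here are assumptions for illustration):

node = get_node(xml_string)                        # selector over one record
loader = HEPLoader(item=HEPRecord(), selector=node)
loader.add_xpath('title', './/ArticleTitle/Title')
loader.add_value('journal_title', 'Some Journal')  # hypothetical value
item = loader.load_item()                          # assemble the HEPRecord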

From inspirehep/hepcrawl: hepcrawl/parsers/arxiv.py
def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root
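
In short: a string gets parsed with get_node, a ready-made selector passes through, and either way the namespaces are stripped so later XPath expressions need no prefixes. Assuming this is a static method on the parser class (ArxivParser here is an assumption about where it lives), usage would look like:

record_xml = '<article><title>An arXiv record</title></article>'
root = ArxivParser.get_root_node(record_xml)   # class name assumed
title = root.xpath('//title/text()').extract_first()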

From inspirehep/hepcrawl: hepcrawl/spiders/dnb_spider.py
def build_item(self, response):
        """Build the final record."""
        node = get_node(response.meta["record"], self.namespaces)
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        record.add_value('authors', self.get_authors(node))
        record.add_xpath('title',
                         "./slim:datafield[@tag='245']/slim:subfield[@code='a']/text()")
        record.add_xpath('source',
                         "./slim:datafield[@tag='264']/slim:subfield[@code='b']/text()")
        record.add_xpath('date_published',
                         "./slim:datafield[@tag='264']/slim:subfield[@code='c']/text()")
        record.add_value('thesis_supervisor',
                         self.get_thesis_supervisors(node))
        record.add_xpath(
            'language', "./slim:datafield[@tag='041']/slim:subfield[@code='a']/text()")
        record.add_value('urls', response.meta.get('urls'))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('abstract', response.meta.get("abstract"))
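
Unlike the EDP spider above, this spider passes a second argument, self.namespaces, so the slim: prefix in its MARC21 XPath expressions resolves. A plausible value, mirroring the namespaces in the test at the top of this page (illustrative; the spider's actual definition may differ):

namespaces = [
    ("OAI-PMH", "http://www.openarchives.org/OAI/2.0/"),
    ("slim", "http://www.loc.gov/MARC21/slim"),
]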

From inspirehep/hepcrawl: hepcrawl/spiders/edp_spider.py
def build_item_jats(self, response):
        """Build the final HEPRecord with JATS-format XML ('jp')."""
        node = get_node(response.meta["record"])
        article_type = response.meta.get("article_type")

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        if article_type in ['correction',
                            'addendum']:
            record.add_xpath('related_article_doi',
                             './/related-article[@ext-link-type="doi"]/@href')
            record.add_value('journal_doctype', article_type)

        record.add_dois(dois_values=response.meta.get("dois"))
        record.add_xpath('page_nr', ".//counts/page-count/@count")
        record.add_xpath('abstract', './/abstract[1]')
        record.add_xpath('title', './/article-title/text()')
        record.add_xpath('subtitle', './/subtitle/text()')
        record.add_value('authors', self._get_authors_jats(node))
        record.add_xpath('collaborations', ".//contrib/collab/text()")

From inspirehep/hepcrawl: hepcrawl/parsers/jats.py
def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root
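
This mirrors the arXiv parser above. The remove_namespaces() call is what lets JATS XPath queries such as .//article-title/text() omit prefixes entirely. A small demonstration of the effect, with a made-up namespace and content:

root = get_node('<article xmlns="http://example.org/ns">'
                '<article-title>Demo</article-title></article>')
root.xpath('//article-title/text()').extract_first()   # None: still namespaced
root.remove_namespaces()
root.xpath('//article-title/text()').extract_first()   # 'Demo'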