How to use the hepcrawl.loaders.HEPLoader class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / hepcrawl / spiders / hindawi_spider.py View on Github external
def parse_node(self, response, node):
        """Iterate all the record nodes in the XML and build the ``HEPRecord``.

        Maps MARCXML datafields from a Hindawi record node into a
        ``HEPLoader``: 245 (title), 260$c (publication date), 300$a
        (page count), 520$a (abstract), 024 (DOI) and 773$p (journal).
        NOTE(review): this example snippet is truncated — the original
        method continues past the last line shown here.
        """

        # Strip XML namespaces so the un-prefixed xpath expressions below match.
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        record.add_value('authors', self.get_authors(node))
        record.add_xpath('abstract', "./datafield[@tag='520']/subfield[@code='a']/text()")
        record.add_xpath('title',
                         "./datafield[@tag='245']/subfield[@code='a']/text()")
        record.add_xpath('date_published',
                         "./datafield[@tag='260']/subfield[@code='c']/text()")
        record.add_xpath('page_nr',
                         "./datafield[@tag='300']/subfield[@code='a']/text()")
        # Only 024 fields whose subfield $2 mentions 'DOI' are collected.
        dois = node.xpath(
            "./datafield[@tag='024'][subfield[@code='2'][contains(text(), 'DOI')]]"
            "/subfield[@code='a']/text()"
        ).extract()
        record.add_dois(dois_values=dois)
        record.add_xpath('journal_title',
                         "./datafield[@tag='773']/subfield[@code='p']/text()")
github inspirehep / hepcrawl / hepcrawl / spiders / brown_spider.py View on Github external
def build_item(self, response):
        """Build the final record.

        Fills a ``HEPLoader`` from the JSON record stashed in
        ``response.meta['jsonrecord']`` plus values scraped in earlier
        callbacks (authors, pages, PDF link, date, thesis info).
        NOTE(review): this example snippet is truncated inside the
        ``ParsedItem(...)`` call — the original method continues.
        """
        jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        # Fields taken straight from the JSON record.
        record.add_value('title', jsonrecord.get('primary_title'))
        record.add_value('abstract', jsonrecord.get('abstract'))
        record.add_value('free_keywords', jsonrecord.get('keyword'))
        # Fields handed forward from previous callbacks via response.meta.
        record.add_value('page_nr', response.meta.get("pages"))
        record.add_value('authors', response.meta.get("authors"))
        record.add_value('file_urls', response.meta.get("pdf_link"))
        record.add_value('urls', jsonrecord.get('uri'))
        record.add_value('date_published', response.meta.get("date"))
        record.add_value('thesis', response.meta.get("thesis"))
        record.add_value('collections', ['HEP', 'THESIS'])

        parsed_item = ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
github inspirehep / hepcrawl / hepcrawl / spiders / pos_spider.py View on Github external
def build_conference_proceedings_item(
        self,
        proceedings_page_html,
        pos_id,
    ):
        """Build a conference-proceedings ``HEPRecord`` from a PoS page.

        Parses the raw proceedings HTML with a Scrapy ``Selector`` and
        loads title, subtitle (date/place) and journal metadata.
        NOTE(review): this example snippet is truncated inside the last
        ``record.add_value(...)`` call — the original method continues.
        """
        selector = Selector(
            text=proceedings_page_html,
            type='html',
        )
        # Drop namespaces so plain xpath queries work on the page.
        selector.remove_namespaces()
        record = HEPLoader(
            item=HEPRecord(),
            selector=selector
        )

        record.add_value('collections', ['proceedings'])
        record.add_value(
            'title',
            self._get_proceedings_title(selector=selector),
        )
        # The conference date and place are stored as the record subtitle.
        record.add_value(
            'subtitle',
            self._get_proceedings_date_place(selector=selector),
        )
        record.add_value('journal_title', 'PoS')
        record.add_value(
            'journal_volume',
github inspirehep / hepcrawl / hepcrawl / spiders / aps_spider.py View on Github external
def _parse_json_on_failure(self, failure):
        """Parse a JSON article entry.

        Fallback callback: reads the article JSON and the original
        response stored on ``failure.request.meta`` and loads DOI,
        page count, abstract, title, authors and collaborations.
        NOTE(review): this example snippet is truncated — the original
        method continues past the last line shown here.
        """
        original_response = failure.request.meta['original_response']
        record = HEPLoader(item=HEPRecord(), response=original_response)
        article = failure.request.meta['json_article']

        doi = get_value(article, 'identifiers.doi', default='')
        record.add_dois(dois_values=[doi])
        # Only add a page count when the source reports a positive number.
        if article.get('numPages', -1) > 0:
            record.add_value('page_nr', str(article.get('numPages', '')))

        record.add_value('abstract', get_value(article, 'abstract.value', default=''))
        record.add_value('title', get_value(article, 'title.value', default=''))
        # record.add_value('subtitle', '')

        authors, collaborations = self._get_authors_and_collab(article)
        record.add_value('authors', authors)
        record.add_value('collaborations', collaborations)

        # record.add_value('free_keywords', free_keywords)
github inspirehep / hepcrawl / hepcrawl / spiders / mit_spider.py View on Github external
def build_item(self, response):
        """Scrape MIT full metadata page and build the final HEPRecord item.

        Returns ``None`` early for non-PhD degrees; otherwise attaches
        hidden fulltext documents and thesis metadata.
        NOTE(review): this example snippet is truncated inside the last
        ``record.add_value(...)`` call — the original method continues.
        """
        node = response.selector
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        doc_type = node.xpath(
            "//td[contains(text(), 'dc.description.degree')]/following-sibling::td[1]/text()").extract_first()
        # Skip anything that is not a PhD thesis ("ph" matches e.g. "Ph.D.").
        if doc_type and "ph" not in doc_type.lower():
            return None

        pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
        if pdf_files:
            record.add_value(
                'documents',
                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
            )
        record.add_value('authors', self.get_authors(node))
        record.add_xpath('date_published',
                         "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")
        record.add_value('thesis', self.get_thesis_info(node))
        # Truncated here in the example page: the supervisor value is missing.
        record.add_value('thesis_supervisor',
github inspirehep / hepcrawl / hepcrawl / spiders / infn_spider.py View on Github external
def build_item(self, response):
        """Build the final HEPRecord item.

        Everything scraped earlier in the crawl is carried in
        ``response.meta``; this callback only assembles the loader.
        NOTE(review): this example snippet is truncated — the original
        method continues past the last line shown here.
        """
        node = response.selector
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        pdf_files = response.meta.get("pdf_links")
        if pdf_files:
            record.add_value(
                'documents',
                self.add_file(pdf_files, "HIDDEN", "Fulltext"),
            )
        record.add_value('authors', response.meta.get("authors"))
        record.add_value('date_published', response.meta.get("date_published"))
        record.add_value('thesis', response.meta.get("thesis_info"))
        record.add_value('thesis_supervisor', response.meta.get("supervisors"))
        record.add_value('title', response.meta.get("titles"))
        record.add_value('urls', response.meta.get("splash_link"))
        record.add_value('abstract', response.meta.get("abstract"))
        record.add_value('source', 'INFN')
        record.add_value('collections', ['HEP', 'THESIS'])
github inspirehep / hepcrawl / hepcrawl / spiders / phil_spider.py View on Github external
def build_item(self, response):
        """Build the final record.

        Loads a Philpapers JSON record; marks the item as a THESIS when
        the publication type or publication info says so.
        NOTE(review): this example snippet is truncated — the original
        method continues past the last line shown here.
        """
        jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        record.add_value('title', jsonrecord['title'])
        record.add_value('abstract', jsonrecord['abstract'])
        record.add_dois(dois_values=jsonrecord['doi'])
        record.add_value('page_nr', jsonrecord['pages'])
        record.add_value('authors', self.get_authors(jsonrecord['authors']))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('urls', jsonrecord['links'])
        record.add_value('source', "Philpapers.org")
        # "forthcoming" records have no usable publication date yet.
        if not jsonrecord.get('year') == "forthcoming":
            record.add_value('date_published', self.get_date(jsonrecord))
        # NOTE(review): .get(...).lower() raises AttributeError if the key
        # is absent — presumably these keys are always present; verify.
        type_thesis = "thesis" in jsonrecord.get('pub_type').lower()
        info_diss = "dissertation" in jsonrecord.get('pubInfo').lower()
        if type_thesis or info_diss:
            record.add_value('collections', ['THESIS'])
github inspirehep / hepcrawl / hepcrawl / spiders / base_spider.py View on Github external
def build_item(self, response):
        """Build the final record.

        Re-parses the raw record XML stashed in ``response.meta`` with
        the spider's namespace map, then loads Dublin Core fields.
        NOTE(review): this example snippet is truncated inside the
        ``ParsedItem(...)`` call — the original method continues.
        """
        node = get_node(response.meta["record"], self.namespaces)
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        record.add_value('file_urls', response.meta.get("direct_link"))
        record.add_value('urls', response.meta.get("urls"))
        record.add_xpath('abstract', './/dc:description/text()')
        # get_title may return an empty title or subtitle; only add what exists.
        title, subtitle = self.get_title(node)
        if title:
            record.add_value('title', title)
        if subtitle:
            record.add_value('subtitle', subtitle)
        record.add_xpath('date_published', './/dc:date/text()')
        record.add_xpath('source', './/base_dc:collname/text()')
        record.add_value("authors", self.get_authors(node))
        record.add_value('thesis', {'degree_type': 'PhD'})
        record.add_value('collections', ['HEP', 'THESIS'])

        parsed_item = ParsedItem(
            record=record.load_item(),
github inspirehep / hepcrawl / hepcrawl / spiders / dnb_spider.py View on Github external
def build_item(self, response):
        """Build the final record."""
        node = get_node(response.meta["record"], self.namespaces)
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        record.add_value('authors', self.get_authors(node))
        record.add_xpath('title',
                         "./slim:datafield[@tag='245']/slim:subfield[@code='a']/text()")
        record.add_xpath('source',
                         "./slim:datafield[@tag='264']/slim:subfield[@code='b']/text()")
        record.add_xpath('date_published',
                         "./slim:datafield[@tag='264']/slim:subfield[@code='c']/text()")
        record.add_value('thesis_supervisor',
                         self.get_thesis_supervisors(node))
        record.add_xpath(
            'language', "./slim:datafield[@tag='041']/slim:subfield[@code='a']/text()")
        record.add_value('urls', response.meta.get('urls'))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('abstract', response.meta.get("abstract"))
        record.add_value('page_nr', response.meta.get("page_nr"))