How to use the hepcrawl.items.HEPRecord class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / tests / unit / test_base.py View on Github external
def test_parsed_node(parsed_node):
    """The parse_node result for a direct link must be a finished HEPRecord
    whose first URL points at the expected PDF.
    """
    expected_pdf = (
        "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    )
    assert isinstance(parsed_node, hepcrawl.items.HEPRecord)
    first_url = parsed_node["urls"][0]
    assert first_url["value"] == expected_pdf
github inspirehep / hepcrawl / hepcrawl / spiders / pos_spider.py View on Github external
def build_conference_paper_item(
        self,
        xml_record,
        conference_paper_url,
        conference_paper_pdf_url,
    ):
        """Build a HEPRecord loader for one PoS conference paper.

        Parses ``xml_record`` (an XML string with a ``pex-dc`` metadata
        block) and fills a HEPLoader with license, publication date,
        journal year, and (started below) the record identifier.

        NOTE(review): this snippet is truncated by the source page — the
        final ``record.add_value(`` call is cut off mid-statement, and
        ``conference_paper_url`` / ``conference_paper_pdf_url`` are not
        used in the visible portion; presumably they are consumed in the
        missing tail.
        """
        # Parse the raw XML string; namespaces are removed so the
        # ``.//metadata/pex-dc/...`` xpaths below need no prefixes.
        selector = Selector(
            text=xml_record,
            type="xml"
        )
        selector.remove_namespaces()
        record = HEPLoader(
            item=HEPRecord(),
            selector=selector
        )

        # License is derived from the free-text <rights> element.
        license_text = selector.xpath(
            './/metadata/pex-dc/rights/text()'
        ).extract_first()
        record.add_value('license', get_licenses(license_text=license_text))

        # _get_date returns both the full date and the bare year.
        date, year = self._get_date(selector=selector)
        record.add_value('date_published', date)
        record.add_value('journal_year', year)

        identifier = selector.xpath(
            ".//metadata/pex-dc/identifier/text()"
        ).extract_first()
        record.add_value(
github inspirehep / hepcrawl / hepcrawl / spiders / edp_spider.py View on Github external
def build_item_jats(self, response):
        """Build the final HEPRecord with JATS-format XML ('jp').

        Reads the raw record and article type from ``response.meta`` and
        fills a HEPLoader via xpaths against the JATS document.

        NOTE(review): this snippet is truncated by the source page — it
        ends without a return/yield, and ``classification_numbers`` is
        unused in the visible portion.
        """
        node = get_node(response.meta["record"])
        article_type = response.meta.get("article_type")

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        # Corrections/addenda additionally carry the DOI of the article
        # they amend, plus an explicit journal doctype.
        if article_type in ['correction',
                            'addendum']:
            record.add_xpath('related_article_doi',
                             './/related-article[@ext-link-type="doi"]/@href')
            record.add_value('journal_doctype', article_type)

        # DOIs were extracted earlier and passed along in response.meta.
        record.add_dois(dois_values=response.meta.get("dois"))
        record.add_xpath('page_nr', ".//counts/page-count/@count")
        record.add_xpath('abstract', './/abstract[1]')
        record.add_xpath('title', './/article-title/text()')
        record.add_xpath('subtitle', './/subtitle/text()')
        record.add_value('authors', self._get_authors_jats(node))
        record.add_xpath('collaborations', ".//contrib/collab/text()")

        free_keywords, classification_numbers = self._get_keywords(node)
        record.add_value('free_keywords', free_keywords)
github inspirehep / hepcrawl / hepcrawl / spiders / elsevier_spider.py View on Github external
def build_item(self, response):
        """Parse an Elsevier XML file into a HEP record.

        NOTE(review): this snippet is truncated by the source page — it
        ends without a return/yield.
        """
        node = response.meta.get("node")
        record = HEPLoader(
            item=HEPRecord(), selector=node, response=response)
        doctype = self.get_doctype(node)
        self.logger.info("Doc type is %s", doctype)
        # Corrections/addenda also carry the DOI of the article they amend.
        if doctype in {'correction', 'addendum'}:
            # NOTE: should test if this is working as intended.
            record.add_xpath(
                'related_article_doi', "//related-article[@ext-link-type='doi']/@href")

        # Attach the source XML as a hidden fulltext document, and add the
        # ScienceDirect URL only if a HEAD request confirms it resolves.
        xml_file = response.meta.get("xml_url")
        if xml_file:
            record.add_value(
                'documents',
                self.add_file(xml_file, "HIDDEN", "Fulltext"),
            )
            sd_url = self._get_sd_url(xml_file)
            if requests.head(sd_url).status_code == 200:  # Test if valid url
                record.add_value("urls", sd_url)
github inspirehep / hepcrawl / hepcrawl / spiders / magic_spider.py View on Github external
def build_item(self, response):
        """Assemble the final HEPRecord for a MAGIC thesis record.

        All field values were extracted earlier in the crawl and stashed
        in ``response.meta``; this step just copies them into a HEPLoader
        and yields the loaded item wrapped in a ParsedItem.
        """
        meta = response.meta
        record = HEPLoader(
            item=HEPRecord(), selector=meta.get("node"), response=response)

        # Fields copied verbatim from the response meta.
        for field_name, meta_key in (
            ('authors', 'authors'),
            ('date_published', 'date'),
            ('title', 'title'),
            ('urls', 'urls'),
            ('abstract', 'abstract'),
            ('documents', 'files'),
        ):
            record.add_value(field_name, meta.get(meta_key))

        # Constant fields: every MAGIC record is a PhD thesis.
        record.add_value('thesis', {'degree_type': "PhD"})
        record.add_value('collections', ['HEP', 'THESIS'])

        yield ParsedItem(
            record=record.load_item(),
            record_format='hepcrawl',
        )
github inspirehep / hepcrawl / hepcrawl / spiders / phil_spider.py View on Github external
def build_item(self, response):
        """Build the final record.

        Reads a Philpapers JSON record from ``response.meta`` and copies
        its fields into a HEPLoader.

        NOTE(review): this snippet is truncated by the source page — the
        final ``elif`` branch is cut off before its body.
        """
        jsonrecord = response.meta.get('jsonrecord')
        record = HEPLoader(
            item=HEPRecord(), selector=jsonrecord, response=response)

        record.add_value('title', jsonrecord['title'])
        record.add_value('abstract', jsonrecord['abstract'])
        record.add_dois(dois_values=jsonrecord['doi'])
        record.add_value('page_nr', jsonrecord['pages'])
        record.add_value('authors', self.get_authors(jsonrecord['authors']))
        record.add_value('file_urls', response.meta.get("direct_links"))
        record.add_value('urls', jsonrecord['links'])
        record.add_value('source', "Philpapers.org")
        # Records marked "forthcoming" have no publication date yet.
        if not jsonrecord.get('year') == "forthcoming":
            record.add_value('date_published', self.get_date(jsonrecord))
        # Classify theses/dissertations from the pub_type/pubInfo strings.
        type_thesis = "thesis" in jsonrecord.get('pub_type').lower()
        info_diss = "dissertation" in jsonrecord.get('pubInfo').lower()
        if type_thesis or info_diss:
            record.add_value('collections', ['THESIS'])
        elif "journal" in jsonrecord.get('pub_type').lower():