How to use the hepcrawl.spiders.edp_spider.EDPSpider class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways EDPSpider is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def test_author_with_email():
    """Test getting author email. JATS format."""
    spider = edp_spider.EDPSpider()
    # NOTE(review): the JATS markup below was flattened to bare text by HTML
    # extraction (only "SnameFnameFname.Sname@university.orga" survived).
    # Tags reconstructed from that text: surname "Sname", given name "Fname",
    # email "Fname.Sname@university.org", affiliation label "a" -- confirm
    # against the upstream test file.
    body = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink">
        <contrib-group content-type="authors">
            <contrib contrib-type="author" corresp="yes">
                <name>
                    <surname>Sname</surname>
                    <given-names>Fname</given-names>
                </name>
                <email>Fname.Sname@university.org</email>
                <xref ref-type="aff" rid="AFF1">a</xref>
            </contrib>
        </contrib-group>
    </article>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]

    parsed_item = spider.parse_node(response, node)
    assert parsed_item
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def package_jats(targzfile):
    """Extract tar.gz package with JATS XML file."""
    edp = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(text="", url="file://" + targzfile)
    # handle_package_file yields requests; the first one is all we need here.
    requests = edp.handle_package_file(fake_resp)
    return next(requests)
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def record_jats(package_jats, scrape_pos_page_body):
    """Return results from the EDP spider with JATS format.

    This is an open access journal, so we can scrape the splash page.
    """
    edp = edp_spider.EDPSpider()
    local_path = package_jats.url.replace("file://", "")
    xml_response = fake_response_from_file(local_path)
    article_node = get_node(edp, "//article", xml_response)[0]

    # parse_node on the JATS record yields a follow-up request for the
    # article splash page; feed it a canned splash-page body.
    splash_request = edp.parse_node(xml_response, article_node)
    splash_response = HtmlResponse(
        url=splash_request.url,
        request=splash_request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )

    parsed_item = splash_request.callback(splash_response)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def test_no_dois_jats():
    """Test parsing when no DOI in record. JATS format."""
    spider = edp_spider.EDPSpider()
    # NOTE(review): the JATS markup was flattened to bare text by HTML
    # extraction ("aa14485-102010A%26A...516A..97N" plus the title).
    # Reconstructed as a publisher-id "aa14485-10" and a non-DOI article-id
    # "2010A%26A...516A..97N" (an ADS bibcode) -- confirm upstream.
    body = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink">
        <front>
            <article-meta>
                <article-id pub-id-type="publisher-id">aa14485-10</article-id>
                <article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title>Dielectronic recombination of argon-like ions</article-title>
                </title-group>
            </article-meta>
        </front>
    </article>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]

    parsed_item = spider.parse_node(response, node)
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def test_addendum_jats():
    """Test parsing when article type is addendum. JATS format."""
    spider = edp_spider.EDPSpider()
    # NOTE(review): the JATS markup was flattened to bare text by HTML
    # extraction; the empty lines after the title were almost certainly a
    # <related-article related-article-type="addendum"> element, which is
    # what this test is about. Reconstructed below -- confirm upstream
    # (the original xlink:href DOI value could not be recovered).
    body = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink">
        <front>
            <article-meta>
                <article-id pub-id-type="publisher-id">aa14485-10</article-id>
                <article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title>Dielectronic recombination of argon-like ions</article-title>
                </title-group>
                <related-article ext-link-type="doi" related-article-type="addendum">
                </related-article>
            </article-meta>
        </front>
    </article>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def test_tarfile(tarbzfile, tmpdir):
    """Test untarring a tar.bz package with a test XML file.

    Also test directory structure flattening.
    """
    edp = edp_spider.EDPSpider()
    target = six.text_type(tmpdir)

    # Extract twice: once keeping the nested layout, once flattened.
    nested_files = edp.untar_files(tarbzfile, target)
    flat_files = edp.untar_files(tarbzfile, target, flatten=True)

    assert len(nested_files) == 1
    assert "aas/xml_rich/2000/01/ds1691.xml" in nested_files[0]
    assert "ds1691.xml" in flat_files[0]
    assert "aas/xml_rich/2000/01" not in flat_files[0]
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def record_rich(package_rich):
    """Return results from the EDP spider with 'rich' format.

    This is not an open access journal, so no splash scraping.
    """
    edp = edp_spider.EDPSpider()
    local_path = package_rich.url.replace("file://", "")
    rich_response = fake_response_from_file(local_path)
    # Flag the response so the spider takes the 'rich'-format parsing path.
    rich_response.meta["rich"] = True
    article_node = get_node(edp, "//EDPSArticle", rich_response)[0]

    parsed_item = edp.parse_node(rich_response, article_node)
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
github inspirehep / hepcrawl / tests / unit / test_edp.py View on Github external
def test_handle_package_ftp(tarbzfile):
    """Test getting the target folder name for xml files."""
    edp = edp_spider.EDPSpider()
    ftp_response = fake_response_from_string(text=tarbzfile)
    ftp_request = next(edp.handle_package_ftp(ftp_response))

    assert isinstance(ftp_request, Request)
    assert ftp_request.meta["source_folder"] == tarbzfile
github inspirehep / hepcrawl / hepcrawl / spiders / edp_spider.py View on Github external
def __init__(self, package_path=None, ftp_folder="incoming", ftp_netrc=None, *args, **kwargs):
    """Construct EDP spider.

    :param package_path: path to local tar.gz or tar.bz2 package.
    :param ftp_folder: path on remote ftp server.
    :param ftp_netrc: path to netrc file.
    """
    super(EDPSpider, self).__init__(*args, **kwargs)
    self.ftp_folder = ftp_folder
    self.ftp_host = "ftp.edpsciences.org"
    self.ftp_netrc = ftp_netrc
    # mkdtemp creates the directory itself (and guarantees it is new), so
    # the original os.path.exists/os.makedirs follow-up was dead code and
    # has been removed.
    self.target_folder = mkdtemp(prefix='EDP_', dir='/tmp/')
    self.package_path = package_path