How to use the hepcrawl.spiders.StatefulSpider class in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways StatefulSpider is used in public projects.

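StatefulSpider is not called on its own; it is used as a base class, combined with one of Scrapy's spider classes, as in the excerpts below. A minimal sketch of that pattern, with a hypothetical spider name and ``itertag`` (neither is taken from hepcrawl):

from scrapy.spiders import XMLFeedSpider

from hepcrawl.spiders import StatefulSpider


class MyFeedSpider(StatefulSpider, XMLFeedSpider):
    """Hypothetical spider: StatefulSpider first, then the Scrapy base class."""
    name = 'my_feed'
    itertag = 'record'  # XML tag to iterate over

    def parse_node(self, response, node):
        # Extract the fields of each <record> node here.
        pass

It then runs like any other Scrapy spider, for example with ``scrapy crawl my_feed``.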

github inspirehep / hepcrawl / hepcrawl / spiders / magic_spider.py
from urlparse import urljoin

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    split_fullname,
    ParsedItem,
    strict_kwargs,
)


class MagicSpider(StatefulSpider, XMLFeedSpider):

    """MAGIC crawler

    Scrapes theses metadata from the `MAGIC telescope web page`_.

    1. ``MagicSpider.parse_node`` will get thesis title, author and date from the listing.
    2. If a link to the splash page exists, ``MagicSpider.scrape_for_pdf`` will try to fetch
       the PDF link, abstract, and authors.
    3. ``MagicSpider.build_item`` will build the ``HEPRecord``.

    Examples:
        ::

            $ scrapy crawl magic

        Using source file::
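
A hedged sketch, not taken from magic_spider.py, of the chained-callback flow that steps 1-3 of the docstring describe (the XPath expressions, field names and method signatures are hypothetical):

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from hepcrawl.spiders import StatefulSpider


class MagicLikeSpider(StatefulSpider, XMLFeedSpider):
    """Hypothetical spider illustrating the parse_node -> scrape_for_pdf chain."""
    name = 'magic_like'
    itertag = 'item'

    def parse_node(self, response, node):
        # Step 1: thesis title, authors and date come from the listing itself.
        record = {
            'title': node.xpath('.//title/text()').extract_first(),
            'authors': node.xpath('.//author/text()').extract(),
            'date': node.xpath('.//date/text()').extract_first(),
        }
        splash_url = node.xpath('.//a/@href').extract_first()
        if not splash_url:
            # Step 3: no splash page, so build the record right away.
            return self.build_item(response, record)
        # Step 2: fetch the splash page for the PDF link and abstract,
        # carrying the partial record along in the request meta.
        return Request(splash_url, callback=self.scrape_for_pdf,
                       meta={'record': record})

    def scrape_for_pdf(self, response):
        record = response.meta['record']
        record['pdf_link'] = response.xpath(
            '//a[contains(@href, ".pdf")]/@href').extract_first()
        return self.build_item(response, record)

    def build_item(self, response, record):
        # The real spider populates a HEPRecord through HEPLoader here.
        return record
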
github inspirehep / hepcrawl / hepcrawl / spiders / elsevier_spider.py
from . import StatefulSpider
from ..dateutils import format_year
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    ParsedItem,
    get_first,
    get_licenses,
    has_numbers,
    range_as_string,
    strict_kwargs,
    unzip_xml_files,
)


class ElsevierSpider(StatefulSpider, XMLFeedSpider):
    """Elsevier crawler.

    This spider can scrape either an ATOM feed (default), a zip file,
    or an extracted XML file.

    1. Default input is the feed XML file. For every URL to a zip package in
       the feed it will yield a request to unzip it. Then for every record in
       the zip files it will yield a request to scrape it. You can also run
       this spider on a zip file or a single record file.
    2. If needed, it will try to scrape the ScienceDirect web page.
    3. HEPRecord will be built.

    Examples:
        Using ``atom_feed``::

            $ scrapy crawl elsevier -a atom_feed=file://`pwd`/tests/responses/elsevier/test_feed.xml -s "JSON_OUTPUT_DIR=tmp/"
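
A hedged sketch of the unzip-and-yield step that point 1 of the docstring describes, using the standard library's ``zipfile`` instead of hepcrawl's ``unzip_xml_files`` helper (the function name and paths are hypothetical):

import os
import zipfile

from scrapy import Request


def requests_for_zip_package(zip_path, target_folder, callback):
    """Extract a downloaded package and yield one request per XML record inside."""
    with zipfile.ZipFile(zip_path) as package:
        package.extractall(target_folder)
        for name in package.namelist():
            if name.endswith('.xml'):
                yield Request('file://' + os.path.join(target_folder, name),
                              callback=callback)
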
github inspirehep / hepcrawl / hepcrawl / spiders / hindawi_spider.py
from __future__ import absolute_import, division, print_function

from scrapy import Request
from scrapy.spiders import XMLFeedSpider

from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    get_licenses,
    ParsedItem,
    strict_kwargs,
)


class HindawiSpider(StatefulSpider, XMLFeedSpider):

    """Hindawi crawler

    * `OAI interface`_
    * `Example Hindawi record`_

    Sets to use:
    HINDAWI.AA (Advances in Astronomy)
    HINDAWI.AHEP (Advances in High Energy Physics)
    HINDAWI.AMP (Advances in Mathematical Physics)
    HINDAWI.JAS (Journal of Astrophysics)
    HINDAWI.JCMP (Journal of Computational Methods in Physics)
    HINDAWI.JGRAV (Journal of Gravity)

    Scrapes Hindawi metadata XML files one at a time.
    The actual files should be retrieved from Hindawi via its OAI interface.
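
Since the docstring says the input files come from Hindawi's OAI interface, here is a hedged sketch of harvesting one of the sets listed above over OAI-PMH (the endpoint URL and metadata prefix are placeholders, not taken from hepcrawl):

import requests

# Replace with Hindawi's actual OAI-PMH endpoint.
OAI_ENDPOINT = 'https://example.org/oai-pmh'

response = requests.get(OAI_ENDPOINT, params={
    'verb': 'ListRecords',
    'set': 'HINDAWI.AHEP',       # Advances in High Energy Physics, from the list above
    'metadataPrefix': 'marc21',  # assumed; check ListMetadataFormats for what is offered
})

with open('hindawi_ahep_records.xml', 'wb') as output_file:
    output_file.write(response.content)

The saved XML file can then be fed to the spider one record file at a time, as the docstring describes.
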
github inspirehep / hepcrawl / hepcrawl / spiders / common / lastrunstore_spider.py
from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR
from os import path, makedirs

from .. import StatefulSpider

LOGGER = logging.getLogger(__name__)


class NoLastRunToLoad(Exception):
    """Error raised when there was a problem with loading the last_runs file"""
    def __init__(self, file_path, set_):
        self.message = "Failed to load file at {} for set {}"\
            .format(file_path, set_)


class LastRunStoreSpider(StatefulSpider):
    """Takes care of storing information about spiders' last run."""
    __metaclass__ = abc.ABCMeta
    from_date = None
    until_date = None
    format = None
    url = None

    @abc.abstractmethod
    def make_file_fingerprint(self, set_):
        """Create an identifier for last run files

        Args:
            set_ (string): the set being harvested
        """
        raise NotImplementedError()
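
``make_file_fingerprint`` is the hook a concrete spider has to provide. A hedged sketch of an implementation (the subclass, its name and URL are hypothetical, not part of hepcrawl) that keys the last-run file on the endpoint URL and the harvested set:

from hepcrawl.spiders.common.lastrunstore_spider import LastRunStoreSpider


class MyOAISpider(LastRunStoreSpider):
    """Hypothetical concrete spider."""
    name = 'my_oai'
    url = 'https://example.org/oai2d'

    def make_file_fingerprint(self, set_):
        # One last-run file per (endpoint, set) combination.
        return 'url={};set={}'.format(self.url, set_)
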
github inspirehep / hepcrawl / hepcrawl / spiders / pos_spider.py
DEFAULT_BASE_URL = 'https://pos.sissa.it'
DEFAULT_BASE_CONFERENCE_PAPER_URL = (
    DEFAULT_BASE_URL + '/contribution?id='
)
DEFAULT_BASE_PROCEEDINGS_URL = (
    DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid='
)


class PoSExtractionException(Exception):
    pass


class POSSpider(StatefulSpider):
    """POS/Sissa crawler.

    From PoS we create two types of records: a conference paper record and a
    conference proceedings record.

    The bulk of the records comes from oaiharvest, and this spider crawls the
    files generated by it.

    For the conference paper record we also have to scrape the HTML page of the
    record on the PoS website to get the PDF link (see
    `DEFAULT_BASE_CONFERENCE_PAPER_URL`).

    Then, from that same page, we get the internal conference id.

    With that conference id we then scrape the conference proceedings page
    and extract the information to create the proceedings record. (see
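
A hedged sketch, not from pos_spider.py, of how the base URLs defined at the top of this excerpt are combined with the identifiers mentioned above (the function and callback names are hypothetical):

from scrapy import Request

from hepcrawl.spiders.pos_spider import (
    DEFAULT_BASE_CONFERENCE_PAPER_URL,
    DEFAULT_BASE_PROCEEDINGS_URL,
)


def conference_paper_page_request(identifier, callback):
    """Request the splash page of one contribution, e.g. 'PoS(LATTICE 2013)001'."""
    return Request(DEFAULT_BASE_CONFERENCE_PAPER_URL + identifier,
                   callback=callback)


def proceedings_page_request(internal_conference_id, callback):
    """Request the proceedings page once the internal conference id is known."""
    return Request(DEFAULT_BASE_PROCEEDINGS_URL + internal_conference_id,
                   callback=callback)
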
github inspirehep / hepcrawl / hepcrawl / spiders / desy_spider.py
from inspire_dojson import marcxml2record
from lxml import etree
from scrapy import Request

from six.moves import urllib

from . import StatefulSpider
from ..utils import (
    ParsedItem,
    ftp_connection_info,
    ftp_list_files,
    strict_kwargs,
)


class DesySpider(StatefulSpider):
    """This spider parses files in XML MARC format (collections or single
    records).

    It can retrieve the files from a remote FTP server or from a local
    directory; the files must have the extension ``.xml``.

    Args:
        source_folder(str): Path to the folder with the MARC files to ingest,
            might be collections or single records. Will be ignored if
            ``ftp_host`` is passed.

        ftp_folder(str): Remote folder where to look for the XML files.

        ftp_host(str):

        ftp_netrc(str): Path to the ``.netrc`` file with the authentication
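
A hedged sketch of driving ``DesySpider`` programmatically with the arguments described above; the paths, host and folder are hypothetical, and the ``JSON_OUTPUT_DIR`` setting is assumed from the other examples on this page:

from scrapy.crawler import CrawlerProcess

from hepcrawl.spiders.desy_spider import DesySpider

process = CrawlerProcess(settings={'JSON_OUTPUT_DIR': 'tmp/'})

# Local ingestion: without ``ftp_host``, the spider reads ``source_folder``.
process.crawl(DesySpider, source_folder='/data/marcxml')

# FTP ingestion: when ``ftp_host`` is passed, ``source_folder`` is ignored.
# process.crawl(DesySpider, ftp_host='ftp.example.org', ftp_folder='incoming',
#               ftp_netrc='/path/to/netrc')

process.start()
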
github inspirehep / hepcrawl / hepcrawl / spiders / wsp_spider.py
from six.moves.urllib.parse import urlsplit

from . import StatefulSpider
from ..parsers import JatsParser
from ..utils import (
    ParsedItem,
    ftp_connection_info,
    ftp_list_files,
    local_list_files,
    strict_kwargs,
    unzip_xml_files,
)


class WorldScientificSpider(StatefulSpider, XMLFeedSpider):
    """World Scientific Proceedings crawler.

    This spider connects to a given FTP host and downloads zip files with
    XML files for extraction into HEP records.

    This means that it generates the URLs for Scrapy to crawl in a special way:

    1. First it connects to an FTP host and lists all the new ZIP files found
       on the remote server and downloads them to a designated local folder,
       using ``WorldScientificSpider.start_requests()``.
    2. Then the ZIP file is unpacked and it lists all the XML files found
       inside, via ``WorldScientificSpider.handle_package()``. Note the
       callback from ``WorldScientificSpider.start_requests()``.
    3. Finally, each XML file is parsed via
       ``WorldScientificSpider.parse_node()``.
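
A hedged sketch of the shape of step 1 above, written as a ``start_requests()`` method but using the standard library's ``ftplib`` instead of hepcrawl's ``ftp_*`` helpers (the host and folders are hypothetical):

import ftplib
import os

from scrapy import Request


def start_requests(self):
    target_folder = '/tmp/WSP'            # assumed to exist
    ftp = ftplib.FTP('ftp.example.org')   # hypothetical host
    ftp.login()                           # or credentials from a .netrc file
    for remote_name in ftp.nlst('WSP'):   # hypothetical remote folder
        if not remote_name.endswith('.zip'):
            continue
        local_path = os.path.join(target_folder, os.path.basename(remote_name))
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary('RETR ' + remote_name, local_file.write)
        # handle_package() then unpacks the ZIP and parses each XML file.
        yield Request('file://' + local_path, callback=self.handle_package)
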
github inspirehep / hepcrawl / hepcrawl / spiders / brown_spider.py
from scrapy import Request
from scrapy.spiders import CrawlSpider

from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    split_fullname,
    parse_domain,
    get_mime_type,
    ParsedItem,
    strict_kwargs,
)


class BrownSpider(StatefulSpider, CrawlSpider):

    """Brown crawler

    Scrapes theses metadata from a `Brown Digital Repository`_ JSON file. You can browse the
    dissertations `here`_.

    Examples:
        Using JSON output directory::

            $ scrapy crawl brown -s "JSON_OUTPUT_DIR=tmp/"

        Using source file and JSON output directory::

            $ scrapy crawl brown -a source_file=file://`pwd`/tests/responses/brown/test_1.json -s "JSON_OUTPUT_DIR=tmp/"

    Todo:
github inspirehep / hepcrawl / hepcrawl / spiders / mit_spider.py
from scrapy.http import Request
from scrapy.spiders import XMLFeedSpider

from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    get_temporary_file,
    split_fullname,
    ParsedItem,
    strict_kwargs,
)


class MITSpider(StatefulSpider, XMLFeedSpider):

    """MIT crawler

    Scrapes theses metadata from `MIT DSpace (Dept. of Physics dissertations)`_.

    1. ``MITSpider.get_list_file`` makes POST requests to get the list of records
       as an HTML file. Defaults are to take the current year and 100 records per file.
    2. ``MITSpider.parse`` iterates through every record on the HTML page and yields
       a request to scrape the full metadata.
    3. ``MITSpider.build_item`` builds the final ``HEPRecord``.

    Examples:
        ::

            $ scrapy crawl MIT
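
A hedged sketch, not from mit_spider.py, of the kind of POST request step 1 above describes: ask DSpace for an HTML listing of the current year's records, 100 per page, and hand the response to a parse callback (the URL and form field names are hypothetical):

import datetime

from scrapy.http import FormRequest


def list_file_request(parse_callback):
    """Build the POST request for the record listing page."""
    return FormRequest(
        url='https://dspace.example.edu/browse',      # hypothetical listing endpoint
        formdata={
            'year': str(datetime.date.today().year),  # default: current year
            'rpp': '100',                             # default: 100 records per file
        },
        callback=parse_callback,
    )
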
github inspirehep / hepcrawl / hepcrawl / spiders / edp_spider.py
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
    ftp_list_files,
    ftp_connection_info,
    get_first,
    get_journal_and_section,
    get_licenses,
    get_node,
    parse_domain,
    ParsedItem,
    strict_kwargs,
)


class EDPSpider(StatefulSpider, Jats, XMLFeedSpider):
    """EDP Sciences crawler.

    This spider connects to a given FTP host and downloads TAR files with
    XML files for extraction into HEP records.

    This means that it generates the URLs for Scrapy to crawl in a special way:

    1. First it connects to an FTP host and lists all the new TAR files found
       on the remote server and downloads them to a designated local folder,
       using ``EDPSpider.start_requests()``. The starting point of the crawl
       can also be a local file. Packages contain XML files with different
       formats (``gz`` package is ``JATS``, ``bz2`` package has ``rich`` and
       ``jp`` format XML files, where ``jp`` is ``JATS``).

    2. Then the TAR file is unpacked and it lists all the XML files found
       inside, via ``EDPSpider.handle_package()``. Note the callback from