from urlparse import urljoin
from scrapy import Request
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
split_fullname,
ParsedItem,
strict_kwargs,
)
class MagicSpider(StatefulSpider, XMLFeedSpider):
"""MAGIC crawler
Scrapes theses metadata from `MAGIC telescope web page`_.
1. ``MagicSpider.parse_node`` will get thesis title, author and date from the listing.
2. If link to the splash page exists, ``MagicSpider.scrape_for_pdf`` will try to fetch
the pdf link, abstract, and authors.
3. ``MagicSpider.build_item`` will build the ``HEPRecord``.
Examples:
::
$ scrapy crawl magic
Using source file::
$ scrapy crawl magic -a source_file=file://`pwd`/path/to/listing_page.html -s "JSON_OUTPUT_DIR=tmp/"
"""
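# A minimal sketch (not the actual hepcrawl implementation) of the three-step
# flow described in the docstring above, assuming ``HEPLoader`` follows
# scrapy's ``ItemLoader`` interface; the class name, XPaths and field names
# are illustrative assumptions only.
class MagicSketchSpider(StatefulSpider, XMLFeedSpider):
    name = 'magic_sketch'
    iterator = 'html'
    itertag = 'tr'  # one listing row per thesis

    def parse_node(self, response, node):
        """Step 1: pull title, author and date out of one listing row."""
        record = {
            'title': node.xpath('./td[1]//text()').extract_first(),
            'authors': node.xpath('./td[2]//text()').extract_first(),
            'date_published': node.xpath('./td[3]//text()').extract_first(),
        }
        splash_link = node.xpath('.//a/@href').extract_first()
        if splash_link:
            # Step 2: follow the splash page to look for the pdf link,
            # abstract and full author list.
            return Request(
                urljoin(response.url, splash_link),
                callback=self.scrape_for_pdf,
                meta={'record': record},
            )
        return self.build_item(response, record)

    def scrape_for_pdf(self, response):
        record = response.meta['record']
        record['abstract'] = response.xpath(
            '//div[@class="abstract"]//text()'
        ).extract_first()
        return self.build_item(response, record)

    def build_item(self, response, record):
        """Step 3: load the collected values into a ``HEPRecord``."""
        loader = HEPLoader(item=HEPRecord(), response=response)
        for key, value in record.items():
            if value:
                loader.add_value(key, value)
        return ParsedItem(record=loader.load_item(), record_format='hepcrawl')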
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..dateutils import format_year
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
ParsedItem,
get_first,
get_licenses,
has_numbers,
range_as_string,
strict_kwargs,
unzip_xml_files,
)
class ElsevierSpider(StatefulSpider, XMLFeedSpider):
"""Elsevier crawler.
This spider can scrape either an ATOM feed (default), a zip file,
or an extracted XML file.
1. The default input is the feed XML file. For every URL to a zip package in the feed
it will yield a request to unzip it. Then, for every record in
the zip files, it will yield a request to scrape it. You can also run
this spider on a zip file or a single record file.
2. If needed, it will also try to scrape the ScienceDirect web page.
3. Finally, the ``HEPRecord`` is built.
Examples:
Using ``atom_feed``::
$ scrapy crawl elsevier -a atom_feed=file://`pwd`/tests/responses/elsevier/test_feed.xml -s "JSON_OUTPUT_DIR=tmp/"
"""
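# Hedged sketch of running the crawl above programmatically instead of via
# the ``scrapy crawl`` command line. Only the ``atom_feed`` argument is
# confirmed by the example above; any other spider argument you pass here is
# an assumption about the spider's signature.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_elsevier(**spider_args):
    """Start a single crawl with whichever input source is passed."""
    process = CrawlerProcess(get_project_settings())
    process.crawl('elsevier', **spider_args)
    process.start()  # blocks until the crawl is finished


if __name__ == '__main__':
    run_elsevier(atom_feed='file:///path/to/test_feed.xml')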
from __future__ import absolute_import, division, print_function
from scrapy import Request
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
get_licenses,
ParsedItem,
strict_kwargs,
)
class HindawiSpider(StatefulSpider, XMLFeedSpider):
"""Hindawi crawler
* `OAI interface`_
* `Example Hindawi record`_
Sets to use:
HINDAWI.AA (Advances in Astronomy)
HINDAWI.AHEP (Advances in High Energy Physics)
HINDAWI.AMP (Advances in Mathematical Physics)
HINDAWI.JAS (Journal of Astrophysics)
HINDAWI.JCMP (Journal of Computational Methods in Physics)
HINDAWI.JGRAV (Journal of Gravity)
Scrapes Hindawi metadata XML files one at a time.
The actual files should be retrieved from Hindawi via its OAI interface.
"""
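# A minimal sketch (not the actual ``HindawiSpider`` code) of how an
# XMLFeedSpider can be pointed at an OAI-PMH MARCXML file like the ones
# Hindawi serves; the class name, XPaths and field names are illustrative
# assumptions, and ``HEPLoader`` is assumed to follow scrapy's ``ItemLoader``
# interface.
class HindawiSketchSpider(StatefulSpider, XMLFeedSpider):
    name = 'hindawi_sketch'
    iterator = 'xml'
    itertag = 'OAI-PMH:record'
    namespaces = [
        ('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'),
        ('marc', 'http://www.loc.gov/MARC21/slim'),
    ]
    source_file = None  # pass with ``-a source_file=file://...``

    def start_requests(self):
        # The OAI-PMH XML file has to be harvested from Hindawi beforehand.
        yield Request(self.source_file)

    def parse_node(self, response, node):
        """Build a ``HEPRecord`` from one ``<record>`` of the OAI file."""
        node.remove_namespaces()  # plain tag names are easier to query
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        record.add_xpath(
            'dois', './/datafield[@tag="024"]/subfield[@code="a"]/text()'
        )
        record.add_xpath(
            'page_nr', './/datafield[@tag="300"]/subfield[@code="a"]/text()'
        )
        return ParsedItem(record=record.load_item(), record_format='hepcrawl')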
import abc
import logging
from errno import EEXIST as FILE_EXISTS, ENOENT as NO_SUCH_FILE_OR_DIR
from os import path, makedirs
from .. import StatefulSpider
LOGGER = logging.getLogger(__name__)
class NoLastRunToLoad(Exception):
"""Error raised when there was a problem with loading the last_runs file"""
def __init__(self, file_path, set_):
self.message = "Failed to load file at {} for set {}"\
.format(file_path, set_)
class LastRunStoreSpider(StatefulSpider):
"""Takes care of storing information about spiders' last run."""
__metaclass__ = abc.ABCMeta
from_date = None
until_date = None
format = None
url = None
@abc.abstractmethod
def make_file_fingerprint(self, set_):
"""Create an identifier for last run files
Args:
set_ (string): the set being harvested
"""
raise NotImplementedError()
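# A hedged sketch of how a concrete spider might satisfy the abstract method
# above; the class name and the fingerprint format are illustrative
# assumptions, not what hepcrawl actually uses.
class ExampleOAISpider(LastRunStoreSpider):
    name = 'example_oai'

    def make_file_fingerprint(self, set_):
        # One last-run file per (url, set) pair keeps harvests independent.
        return 'set={set_}&url={url}'.format(set_=set_, url=self.url)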
from . import StatefulSpider
DEFAULT_BASE_URL = 'https://pos.sissa.it'
DEFAULT_BASE_CONFERENCE_PAPER_URL = (
DEFAULT_BASE_URL + '/contribution?id='
)
DEFAULT_BASE_PROCEEDINGS_URL = (
DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid='
)
class PoSExtractionException(Exception):
pass
class POSSpider(StatefulSpider):
"""POS/Sissa crawler.
From PoS we create two types of records: a conference paper record and a
conference proceedings record.
The bulk of the records comes from oaiharvest, and this spider crawls the
files generated by it.
For the conference paper record we also have to scrape the HTML page of the
record on the PoS website to get the PDF link (see
`DEFAULT_BASE_CONFERENCE_PAPER_URL`).
Then, from that same page, we get the internal conference id.
With that conference id, we then scrape the conference proceedings page
and extract the information to create the proceedings record (see
`DEFAULT_BASE_PROCEEDINGS_URL`).
"""
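# A minimal sketch of the two URL constructions described above, using the
# module-level constants; the example identifiers are illustrative, and a
# real crawler would URL-encode them.
def conference_paper_url(identifier):
    """Splash page of a single contribution; scraped for the pdf link."""
    return DEFAULT_BASE_CONFERENCE_PAPER_URL + identifier


def conference_proceedings_url(internal_id):
    """Proceedings page reached via the internal conference id."""
    return DEFAULT_BASE_PROCEEDINGS_URL + internal_id


# e.g. conference_paper_url('PoS(LATTICE 2013)001')
#      conference_proceedings_url('187')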
from inspire_dojson import marcxml2record
from lxml import etree
from scrapy import Request
from six.moves import urllib
from . import StatefulSpider
from ..utils import (
ParsedItem,
ftp_connection_info,
ftp_list_files,
strict_kwargs,
)
class DesySpider(StatefulSpider):
"""This spider parses files in XML MARC format (collections or single
records).
It can retrieve the files from a remote FTP server or from a local directory;
they must have the extension ``.xml``.
Args:
source_folder(str): Path to the folder with the MARC files to ingest,
might be collections or single records. Will be ignored if
``ftp_host`` is passed.
ftp_folder(str): Remote folder where to look for the XML files.
ftp_host(str): Hostname of the FTP server to connect to.
ftp_netrc(str): Path to the ``.netrc`` file with the authentication
details for the FTP server.
"""
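# Hedged usage sketch: invoking the spider with the arguments documented in
# the Args block above. The spider name ``desy`` and the JSON_OUTPUT_DIR
# setting mirror the other examples in this file and are assumptions here;
# the host and paths are placeholders.
#
#   Local ingestion (``ftp_host`` not passed, so ``source_folder`` is used):
#
#       $ scrapy crawl desy -a source_folder=/data/marcxml -s "JSON_OUTPUT_DIR=tmp/"
#
#   Remote ingestion (``source_folder`` is ignored when ``ftp_host`` is given):
#
#       $ scrapy crawl desy -a ftp_host=ftp.example.org -a ftp_folder=DESY -a ftp_netrc=/path/to/netrc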
from six.moves.urllib.parse import urlsplit
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..parsers import JatsParser
from ..utils import (
ParsedItem,
ftp_connection_info,
ftp_list_files,
local_list_files,
strict_kwargs,
unzip_xml_files,
)
class WorldScientificSpider(StatefulSpider, XMLFeedSpider):
"""World Scientific Proceedings crawler.
This spider connects to a given FTP host and downloads zip files with
XML files for extraction into HEP records.
This means that it generates the URLs for Scrapy to crawl in a special way:
1. First it connects to an FTP host and lists all the new ZIP files found
on the remote server and downloads them to a designated local folder,
using ``WorldScientificSpider.start_requests()``.
2. Then the ZIP file is unpacked and it lists all the XML files found
inside, via ``WorldScientificSpider.handle_package()``. Note the
callback from ``WorldScientificSpider.start_requests()``.
3. Finally, each XML file is parsed via
``WorldScientificSpider.parse_node()``.
"""
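# A stand-alone sketch of steps 1 and 2 above using only the standard
# library (``ftplib``/``zipfile``) instead of the hepcrawl FTP helpers
# imported earlier; host, credentials and paths are illustrative.
import os
import zipfile
from ftplib import FTP


def download_new_packages(host, user, password, remote_dir, local_dir):
    """Step 1: list remote ZIP files and download the ones we do not have."""
    ftp = FTP(host)
    ftp.login(user=user, passwd=password)
    ftp.cwd(remote_dir)
    downloaded = []
    for name in ftp.nlst():
        target = os.path.join(local_dir, name)
        if not name.endswith('.zip') or os.path.exists(target):
            continue
        with open(target, 'wb') as fd:
            ftp.retrbinary('RETR ' + name, fd.write)
        downloaded.append(target)
    ftp.quit()
    return downloaded


def unpack_package(zip_path, target_folder):
    """Step 2: unpack a ZIP and return the XML files found inside."""
    with zipfile.ZipFile(zip_path) as package:
        package.extractall(target_folder)
        return [
            os.path.join(target_folder, name)
            for name in package.namelist()
            if name.endswith('.xml')
        ]


# Step 3 would hand each XML file to ``WorldScientificSpider.parse_node()``.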
from scrapy import Request
from scrapy.spiders import CrawlSpider
from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
split_fullname,
parse_domain,
get_mime_type,
ParsedItem,
strict_kwargs,
)
class BrownSpider(StatefulSpider, CrawlSpider):
"""Brown crawler
Scrapes thesis metadata from a `Brown Digital Repository`_ JSON file. You can browse the
dissertations `here`_.
Examples:
Using JSON output directory::
$ scrapy crawl brown -s "JSON_OUTPUT_DIR=tmp/"
Using source file and JSON output directory::
$ scrapy crawl brown -a source_file=file://`pwd`/tests/responses/brown/test_1.json -s "JSON_OUTPUT_DIR=tmp/"
Todo:
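# Hedged sketch of digging the per-record links out of a Brown Digital
# Repository JSON listing; the ``items``/``docs``/``json_uri`` keys are
# assumptions about the payload layout, not verified field names.
import json


def record_links(json_body):
    """Return the per-record JSON URLs found in one listing page."""
    data = json.loads(json_body)
    return [
        doc['json_uri']
        for doc in data.get('items', {}).get('docs', [])
        if 'json_uri' in doc
    ]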
from scrapy.http import Request
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
get_temporary_file,
split_fullname,
ParsedItem,
strict_kwargs,
)
class MITSpider(StatefulSpider, XMLFeedSpider):
"""MIT crawler
Scrapes theses metadata from `MIT DSpace (Dept. of Physics dissertations)`_.
1. ``MITSpider.get_list_file`` makes POST requests to get the list of records as an HTML
file. By default it takes the current year and 100 records per file.
2. ``MITSpider.parse`` iterates through every record on the HTML page and yields
a request to scrape the full metadata.
3. ``MITSpider.build_item`` builds the final ``HEPRecord``.
Examples:
::
$ scrapy crawl MIT
"""
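# A hedged sketch of step 1 above: posting the search form that fetches one
# HTML listing page for the current year. The URL and the form field names
# are illustrative assumptions, not the ones ``MITSpider`` actually uses.
from datetime import date

from scrapy.http import FormRequest


def request_listing(year=None, per_page=100):
    """Build the POST request that fetches one HTML page of records."""
    year = year or date.today().year
    return FormRequest(
        'https://dspace.mit.edu/handle/1721.1/7608/discover',  # assumed URL
        formdata={
            'filter_year': str(year),
            'rpp': str(per_page),  # records per page; 100 by default above
        },
    )


# The response would then be handed to ``MITSpider.parse`` (step 2).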
from scrapy.spiders import XMLFeedSpider
from . import StatefulSpider
from ..extractors.jats import Jats
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import (
ftp_list_files,
ftp_connection_info,
get_first,
get_journal_and_section,
get_licenses,
get_node,
parse_domain,
ParsedItem,
strict_kwargs,
)
class EDPSpider(StatefulSpider, Jats, XMLFeedSpider):
"""EDP Sciences crawler.
This spider connects to a given FTP host and downloads tar packages with
XML files for extraction into HEP records.
This means that it generates the URLs for Scrapy to crawl in a special way:
1. First it connects to an FTP host and lists all the new TAR files found
on the remote server and downloads them to a designated local folder,
using ``EDPSpider.start_requests()``. The starting point of the crawl
can also be a local file. Packages contain XML files in different
formats (a ``gz`` package is ``JATS``; a ``bz2`` package has ``rich`` and
``jp`` format XML files, of which ``jp`` is ``JATS``).
2. Then the TAR file is unpacked and it lists all the XML files found
inside, via ``EDPSpider.handle_package()``. Note the callback from
``EDPSpider.start_requests()``.
"""
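# A stand-alone sketch (standard library only) of step 2 above: unpacking a
# downloaded TAR package and collecting the XML files it contains; paths are
# illustrative.
import os
import tarfile


def unpack_edp_package(tar_path, target_folder):
    """Extract a ``gz``/``bz2`` package and return the XML files inside."""
    with tarfile.open(tar_path) as package:  # compression is auto-detected
        package.extractall(target_folder)
        xml_files = [
            os.path.join(target_folder, name)
            for name in package.getnames()
            if name.endswith('.xml')
        ]
    # ``gz`` packages hold JATS XML; ``bz2`` packages hold ``rich`` and ``jp``
    # files, of which ``jp`` is also JATS (see the docstring above).
    return xml_files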