def record():
"""Return scraping results from the INFN spider."""
spider = infn_spider.InfnSpider()
response = fake_response_from_file('infn/test_splash.html')
parsed_item = spider.scrape_splash(response)
assert parsed_item
assert parsed_item.record
return parsed_item.record
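# Hedged sketch: the snippets in this section read like pytest fixtures whose
# decorators were stripped during extraction. Registered as a fixture, the
# helper above could be consumed like this (fixture and test names are
# illustrative, not from the original suite):
import pytest

@pytest.fixture(name='infn_record')
def infn_record_fixture():
    return record()  # the helper defined above

def test_infn_record_is_populated(infn_record):
    assert infn_record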
def get_records(response_file_name):
"""Return all results generator from the WSP spider via pipelines."""
# environmental variables needed for the pipelines payload
os.environ['SCRAPY_JOB'] = 'scrapy_job'
os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'
spider = create_spider()
records = spider.parse(
fake_response_from_file(
file_name=response_file_name,
response_type=TextResponse
)
)
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
return (
pipeline.process_item(record, spider)['record']
for record in records
)
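# Hedged alternative: the helper above writes to os.environ and never cleans up,
# which can leak state between tests. A sketch using unittest.mock.patch.dict
# keeps the same variables scoped to the call; list() forces the generator while
# the variables are still set:
from unittest import mock

SCRAPY_ENV = {
    'SCRAPY_JOB': 'scrapy_job',
    'SCRAPY_FEED_URI': 'scrapy_feed_uri',
    'SCRAPY_LOG_FILE': 'scrapy_log_file',
}

def get_records_scoped(response_file_name):
    with mock.patch.dict(os.environ, SCRAPY_ENV):
        return list(get_records(response_file_name))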
def get_records(response_file_name):
"""Return all results generator from the ``Desy`` spider via pipelines."""
# environmental variables needed for the pipelines payload
os.environ['SCRAPY_JOB'] = 'scrapy_job'
os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'
spider = create_spider()
records = spider.parse(
fake_response_from_file(
file_name='desy/' + response_file_name,
response_type=TextResponse
)
)
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
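# Hedged convenience wrapper mirroring a common pattern in suites like this:
# pull a single record out of the generator returned by get_records.
def get_one_record(response_file_name):
    records = get_records(response_file_name)
    return next(records)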
def record():
"""Return results generator from the Phil spider.
Thesis specific.
"""
spider = phil_spider.PhilSpider()
response = fake_response_from_file('phil/test_thesis.json')
jsonrecord = json.loads(response.body_as_unicode())
response.meta["jsonrecord"] = jsonrecord[0]
response.meta["direct_links"] = [
"http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fanalysis.oxfordjournals.org%2Fcontent%2F66%2F3%2F194.full.pdf%2Bhtml%3Fframe%3Dsidebar",
"http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fbrogaardb.googlepages.com%2Ftensedrelationsoffprint.pdf"
]
parsed_item = spider.build_item(response)
assert parsed_item
assert parsed_item.record
return parsed_item.record
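# Hedged sketch: the direct_links above are philpapers.org go.pl proxy URLs whose
# percent-encoded ``u`` query parameter carries the real target. Standard-library
# decoding recovers it; nothing here assumes anything about the spider itself:
from urllib.parse import parse_qs, urlparse

def decode_proxy_link(link):
    """Return the decoded target URL from a go.pl proxy link."""
    return parse_qs(urlparse(link).query)['u'][0]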
def record():
"""Return results generator from the WSP spider."""
spider = iop_spider.IOPSpider()
response = fake_response_from_file('iop/xml/test_standard.xml')
node = get_node(spider, "Article", response)
spider.pdf_files = TEST_PDF_DIR
parsed_item = spider.parse_node(response, node)
assert parsed_item
assert parsed_item.record
return parsed_item.record
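# Hedged sketch of what a get_node-style helper plausibly does: the real helper
# ships with the test utilities, so this reimplementation is an assumption for
# illustration only, modeled on the Selector pattern used in the BASE fixture
# further below:
from scrapy.selector import Selector

def get_node_sketch(spider, tag, response):
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    return selector.xpath('//' + tag)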
def results_from_json():
"""Return results by parsing a JSON file."""
from scrapy.http import TextResponse
crawler = Crawler(spidercls=aps_spider.APSSpider)
spider = aps_spider.APSSpider.from_crawler(crawler)
parsed_items = list(
spider.parse(
fake_response_from_file(
'aps/aps_single_response.json',
response_type=TextResponse,
)
)
)
class MockFailure:
"""Mock twisted.python.failure.Failure, failure on JATS request."""
def __init__(self):
self.request = parsed_items[0]
records = [spider._parse_json_on_failure(MockFailure()).record]
assert records
return records
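# Hedged sketch of what MockFailure stands in for: Scrapy errbacks receive a
# twisted Failure, with the failed Request attached as ``failure.request``.
# Something like this would exercise the same code path with a real Failure:
from twisted.python.failure import Failure

def make_failure(request):
    failure = Failure(Exception('JATS request failed'))  # wrap any exception
    failure.request = request  # mirrors what Scrapy attaches on download errors
    return failure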
mock.head(
'http://d-nb.info/1079912991/34',
headers={
'Content-Type': 'application/pdf;charset=base64',
}
)
mock.head(
'http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/38625',
headers={
'Content-Type': 'text/html',
}
)
request = next(spider.parse(
    fake_response_from_file('dnb/test_1.xml')
))
response = HtmlResponse(
    url=request.url,
    request=request,
    body=scrape_pos_page_body,
    encoding='utf-8',
)
parsed_item = request.callback(response)
assert parsed_item
assert parsed_item.record
return parsed_item.record
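# Hedged helper capturing the two-hop pattern above: take the Request yielded by
# parse(), fake a response for it, and invoke the Request's own callback. The
# helper name is illustrative; only standard Scrapy response kwargs are used:
from scrapy.http import HtmlResponse

def follow_request(request, body, encoding='utf-8'):
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=body,
        encoding=encoding,
    )
    return request.callback(response)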
def record_rich(package_rich):
"""Return results from the EDP spider with 'rich' format.
This is not an open access journal, so no splash scraping.
"""
spider = edp_spider.EDPSpider()
xml_path = package_rich.url.replace("file://", "")
fake_resp = fake_response_from_file(xml_path)
fake_resp.meta["rich"] = True
node = get_node(spider, "//EDPSArticle", fake_resp)[0]
parsed_item = spider.parse_node(fake_resp, node)
assert parsed_item
assert parsed_item.record
return parsed_item.record
def record():
"""Return results generator from the crossref spider. All fields, one record.
"""
def _get_record_from_processed_item(item, spider):
crawl_result = pipeline.process_item(item, spider)
validate(crawl_result['record'], 'hep')
assert crawl_result
return crawl_result['record']
crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
fake_response = fake_response_from_file(
'crossref/sample_crossref_record.json',
response_type=TextResponse,
)
parsed_items = spider.parse(fake_response)
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
yield _get_record_from_processed_item(parsed_items, spider)
clean_dir()
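# Hedged note in code form: in a pytest yield-fixture, everything after the
# ``yield`` runs as teardown, so clean_dir() above executes only after the
# consuming test finishes. Minimal shape of the same pattern (the payload is
# hypothetical):
import pytest

@pytest.fixture
def crossref_record():
    rec = {'titles': [{'title': 'example'}]}
    yield rec
    # teardown: runs after the test that used the fixture completes,
    # playing the role clean_dir() does in the fixture above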
def record():
"""Return built HEPRecord from the BASE spider."""
spider = base_spider.BaseSpider()
response = fake_response_from_file('base/test_1.xml')
selector = Selector(response, type='xml')
spider._register_namespaces(selector)
nodes = selector.xpath('.//%s' % spider.itertag)
response.meta["record"] = nodes[0].extract()
response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
parsed_item = spider.build_item(response)
assert parsed_item
assert parsed_item.record
return parsed_item.record
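# Hedged, fully self-contained demo of the Selector/namespace pattern the BASE
# fixture relies on (the namespace URI and XML are made up for the example; the
# real namespaces are registered by the spider's _register_namespaces):
from scrapy.selector import Selector

xml = (
    '<records xmlns:x="http://example.org/ns">'
    '<x:record><x:title>demo</x:title></x:record>'
    '</records>'
)
sel = Selector(text=xml, type='xml')
sel.register_namespace('x', 'http://example.org/ns')
assert sel.xpath('.//x:title/text()').extract() == ['demo']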