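# The examples below are independent pytest fixtures and helpers (collected from
# different test modules) that run hepcrawl spiders through the INSPIRE push
# pipelines. The imports they rely on are gathered here; the hepcrawl module
# paths are assumptions and may differ from the actual project layout.
import os

import pytest
from scrapy.crawler import Crawler
from scrapy.http import HtmlResponse, TextResponse

from hepcrawl.pipelines import InspireCeleryPushPipeline  # assumed path
from hepcrawl.spiders import arxiv_spider, crossref_spider, pos_spider  # assumed path
from hepcrawl.testlib.fixtures import clean_dir, fake_response_from_file  # assumed path
from inspire_schemas.utils import validate  # assumed path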
"""
def _get_record_from_processed_item(item, spider):
crawl_result = pipeline.process_item(item, spider)
validate(crawl_result['record'], 'hep')
assert crawl_result
return crawl_result['record']
crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
fake_response = fake_response_from_file(
'crossref/sample_crossref_record.json',
response_type=TextResponse,
)
parsed_items = spider.parse(fake_response)
pipeline = InspireCeleryPushPipeline()
pipeline.open_spider(spider)
yield _get_record_from_processed_item(parsed_items, spider)
clean_dir()
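# Minimal usage sketch (not part of the original snippet): pytest injects the
# fixture above by name, so a test only needs to declare it as a parameter.
# The ``titles`` key is an assumption about the mapped HEP record.
def test_crossref_record_has_titles(crossref_record):
    assert 'titles' in crossref_record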
def get_records(response_file_name):
    """Return a generator of all results from the ``Desy`` spider via the pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name='desy/' + response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
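# Usage sketch (assumption, not from the original snippet): consume the
# generator and validate each record against the HEP schema; the sample
# file name is hypothetical.
def test_desy_records_validate_against_hep_schema():
    for record in get_records('desy_collection_records.xml'):
        validate(record, 'hep')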
# NOTE: the original fixture definition line is missing from this snippet; the
# name and the page-body parameter are assumed from the surrounding conventions.
@pytest.fixture
def pos_record(scrape_pos_conference_paper_page_body):
    """Return a record from the ``POS`` spider via the pipelines."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
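# Hypothetical companion fixture (assumption, not from the original snippet):
# the page body injected above would typically be read from a saved sample of
# the conference-paper HTML page; the file path here is made up.
@pytest.fixture
def scrape_pos_conference_paper_page_body():
    path = os.path.join('responses', 'pos', 'sample_pos_conference_paper_page.html')
    with open(path, 'rb') as fd:
        return fd.read()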
# NOTE: the original fixture definition line is missing from this snippet; the
# name and decorator below are assumed from the surrounding test conventions.
@pytest.fixture
def arxiv_records():
    """Return the list of records from the ``arXiv`` spider via the pipelines."""
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
def get_records(response_file_name):
    """Return a generator of all results from the WSP spider via the pipelines."""
    # environment variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse,
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )
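# Hypothetical helper (assumption): ``create_spider`` is defined elsewhere in the
# original test modules and is not shown in these snippets; a minimal version for
# the WSP example would mirror the crawler/spider setup used above (spider class
# and module path assumed).
def create_spider():
    from hepcrawl.spiders import wsp_spider  # assumed path
    crawler = Crawler(spidercls=wsp_spider.WorldScientificSpider)
    return wsp_spider.WorldScientificSpider.from_crawler(crawler)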
def many_results(spider):
    """Return a generator of results from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
def __init__(self):
    from celery import Celery

    super(InspireCeleryPushPipeline, self).__init__()
    self.celery = Celery()
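# Hedged sketch (assumption, not the project's actual code): a hook like the
# ``open_spider`` called in the fixtures above would typically sit on the same
# pipeline class and point the Celery app at the broker configured in the
# crawler settings before results are pushed; the setting names and defaults
# below are illustrative only.
def open_spider(self, spider):
    self.celery.conf.update(dict(
        broker_url=spider.settings.get('BROKER_URL', 'pyamqp://guest@localhost//'),
        task_serializer='json',
        accept_content=['json'],
    ))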