How to use the hepcrawl.pipelines.InspireCeleryPushPipeline class in hepcrawl

To help you get started, we've selected a few hepcrawl examples, based on popular ways InspireCeleryPushPipeline is used in public projects.


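All of the examples below follow the same pattern: instantiate the pipeline, open it for a spider, feed parsed items through process_item, and read the enriched record from the returned crawl result. Here is a minimal sketch of that pattern; push_records_through_pipeline is an illustrative name, not part of hepcrawl, and spider and parsed_items are whatever the surrounding test builds, typically via hepcrawl's fake_response_from_file helper:

from hepcrawl.pipelines import InspireCeleryPushPipeline

def push_records_through_pipeline(spider, parsed_items):
    """Yield the enriched HEP record for each parsed item.

    ``spider`` and ``parsed_items`` are built by the caller, e.g. from a
    fake response as in the test snippets below.
    """
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    for item in parsed_items:
        crawl_result = pipeline.process_item(item, spider)
        yield crawl_result['record']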

From inspirehep/hepcrawl, tests/unit/test_crossref.py:
"""
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_items, spider)

    clean_dir()
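
Note the shape of this snippet: it is a pytest fixture. The yield hands the processed record to the test, and clean_dir() runs afterwards as teardown, cleaning up any files the crawl left behind. The PoS and arXiv examples below follow the same pattern.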

From inspirehep/hepcrawl, tests/unit/test_desy.py:
def get_records(response_file_name):
    """Return all results generator from the ``Desy`` spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name='desy/' + response_file_name,
            response_type=TextResponse
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(
            record,
            spider
        )['record'] for record in records
    )
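
A test consumes this generator directly, validating each record as it goes; for instance (the response file name here is hypothetical):

    for record in get_records('desy_collection_records.xml'):
        validate(record, 'hep')

The World Scientific example further down uses the same helper shape.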

From inspirehep/hepcrawl, tests/unit/test_pos.py:
@pytest.fixture
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    # fixture wrapper assumed from context; scrape_pos_conference_paper_page_body
    # is itself a fixture supplying the scraped page body used below
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()

From inspirehep/hepcrawl, tests/unit/test_arxiv_single.py:
@pytest.fixture
def records():  # fixture name assumed; the snippet begins mid-function
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [_get_record_from_processed_item(parsed_item, spider) for parsed_item in parsed_items]

    clean_dir()

From inspirehep/hepcrawl, tests/unit/test_world_scientific.py:
def get_records(response_file_name):
    """Return all results generator from the WSP spider via pipelines."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
        fake_response_from_file(
            file_name=response_file_name,
            response_type=TextResponse
        )
    )

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    return (
        pipeline.process_item(record, spider)['record']
        for record in records
    )

From inspirehep/hepcrawl, tests/unit/test_arxiv_all.py:
@pytest.fixture
def many_results(spider):
    """Return a generator of results from the ``Arxiv`` spider: tricky
    fields, many records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()

From inspirehep/hepcrawl, hepcrawl/pipelines.py:
class InspireCeleryPushPipeline(InspireAPIPushPipeline):
    def __init__(self):
        from celery import Celery

        super(InspireCeleryPushPipeline, self).__init__()
        self.celery = Celery()
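
Note the deferred import: Celery is pulled in only when the pipeline is instantiated, so the rest of hepcrawl's pipelines module can be imported without Celery installed. The instance starts with a bare Celery() application; configuring it and dispatching the actual push happen later in the pipeline's lifecycle, outside this snippet.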