How to use Scrapy - 10 common examples

To help you get started, we've selected ten Scrapy examples drawn from public projects, each illustrating a common way the library is used. Every snippet is labeled with the repository and file it comes from.


Example 1: aiqm/torchani (tools/generate-unit-test-expect/nist-dataset/nist.py)
def start_requests(self):
        # urltemplate, min_weight and max_weight are defined (or imported)
        # at module level elsewhere in nist.py
        start_url = urltemplate.format(min_weight, max_weight)
        # the lambda binds the current range into the parse callback
        yield scrapy.Request(
            url=start_url,
            callback=lambda x: self.parse_range(x, min_weight, max_weight))
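
The lambda is how this spider forwards extra arguments into its parse method. A self-contained sketch of the same pattern, with a hypothetical URL template and bounds standing in for the ones nist.py defines:

import scrapy

class RangeSpider(scrapy.Spider):
    name = 'range'
    # hypothetical values, for illustration only
    urltemplate = 'https://example.com/search?min={}&max={}'
    min_weight, max_weight = 1, 100

    def start_requests(self):
        start_url = self.urltemplate.format(self.min_weight, self.max_weight)
        yield scrapy.Request(
            url=start_url,
            callback=lambda response: self.parse_range(
                response, self.min_weight, self.max_weight))

    def parse_range(self, response, min_weight, max_weight):
        self.logger.info('parsed %s for range %s-%s',
                         response.url, min_weight, max_weight)

On Scrapy 1.7+, cb_kwargs does the same job without the lambda: scrapy.Request(url, callback=self.parse_range, cb_kwargs={'min_weight': 1, 'max_weight': 100}).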

Example 2: scrapedia/scrapy-cookies (tests/test_storages/test_storage_mongo.py)
def setUp(self):
        self.spider = Spider("foo")
        # layer the test's own settings on top of the package defaults
        self.settings = Settings()
        self.settings.setmodule(default_settings)
        self.settings.setdict(self.local_settings)
        self.storage = MongoStorage(self.settings)
        self.storage.open_spider(self.spider)
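
Settings layers values in order: setmodule() establishes defaults and setdict() overrides them, and the code under test then reads the merged result. A standalone sketch (the MONGODB_URI key is illustrative, standing in for whatever keys the storage backend reads):

from scrapy.settings import Settings

settings = Settings()
settings.setmodule('scrapy.settings.default_settings')  # package defaults
settings.setdict({'MONGODB_URI': 'mongodb://localhost:27017'})  # overrides
print(settings.get('MONGODB_URI'))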

Example 3: scrapy/scrapy (tests/test_pipeline_media.py)
def _assert_request_no3xx(self, pipeline_class, settings):
        pipe = pipeline_class(settings=Settings(settings))
        request = Request('http://url')
        pipe._modify_media_request(request)

        self.assertIn('handle_httpstatus_list', request.meta)
        for status, check in [
                (200, True),

                # These are the status codes we want
                # the downloader to handle itself
                (301, False),
                (302, False),
                (303, False),
                (307, False),
                (308, False),

                # we still want to get 4xx and 5xx
                (400, True),
                (500, True),
                ]:
            # the excerpt was truncated after the 400 entry; the list is
            # closed here and the loop body restored so the test runs
            if check:
                self.assertIn(status, request.meta['handle_httpstatus_list'])
            else:
                self.assertNotIn(status, request.meta['handle_httpstatus_list'])
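
What the pipeline is being tested for: media requests carry a handle_httpstatus_list in their meta so that error statuses reach the pipeline's handlers, while 3xx codes are deliberately left out so the downloader still follows redirects. The same key works on any request; for example, to let a callback see 404 pages instead of having HttpErrorMiddleware drop them:

import scrapy

req = scrapy.Request(
    'https://example.com/maybe-missing',
    meta={'handle_httpstatus_list': [404]},
)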

Example 4: scrapedia/scrapy-cookies (tests/test_downloadermiddleware_cookies.py)
        # embed C1 and C3 for scrapytest.org/foo
        req = Request("http://scrapytest.org/foo")
        self.mw.process_request(req, self.spider)
        # cookie order in the header is not guaranteed, so accept both
        assert req.headers.get("Cookie") in (
            b"C1=value1; C3=value3",
            b"C3=value3; C1=value1",
        )

        # embed C2 for scrapytest.org/bar
        req = Request("http://scrapytest.org/bar")
        self.mw.process_request(req, self.spider)
        self.assertEqual(req.headers.get("Cookie"), b"C2=value2")

        # embed nothing for scrapytest.org/baz
        req = Request("http://scrapytest.org/baz")
        self.mw.process_request(req, self.spider)
        assert "Cookie" not in req.headers

Example 5: scrapy/scrapy (tests/test_spidermiddleware_httperror.py)
def start_requests(self):
        for url in self.start_urls:
            # send download failures and filtered HTTP error responses to on_error
            yield Request(url, self.parse, errback=self.on_error)
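
The errback receives a twisted Failure rather than a Response. The usual handler shape, following the pattern in Scrapy's documentation, distinguishes filtered HTTP responses from network-level failures (this would be defined as a method on the spider):

from scrapy.spidermiddlewares.httperror import HttpError

def on_error(self, failure):
    # failure is a twisted.python.failure.Failure
    if failure.check(HttpError):
        response = failure.value.response
        self.logger.warning('got HTTP %s on %s', response.status, response.url)
    else:
        self.logger.error(repr(failure))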

Example 6: amperser/SublimeLinter-contrib-proselint (tests/corpus/newyorker/newyorker/items.py)
# -*- coding: utf-8 -*-

"""Stuff to pull from a New Yorker article."""

import scrapy


class NewYorkerItem(scrapy.Item):

    """Pull the title, author, text, and link."""

    title = scrapy.Field()
    author = scrapy.Field()
    text = scrapy.Field()
    link = scrapy.Field()
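
Items behave like dicts with a fixed key set, so a spider callback populates NewYorkerItem field by field. A hedged sketch of such a callback; the CSS selectors are invented placeholders, not taken from the project:

def parse_article(self, response):
    # a spider callback (hypothetical selectors)
    item = NewYorkerItem()
    item['title'] = response.css('h1::text').get()
    item['author'] = response.css('.byline a::text').get()
    item['text'] = ' '.join(response.css('p::text').getall())
    item['link'] = response.url
    yield item

Assigning to a key that is not a declared Field raises KeyError, which is the point of declaring the fields up front.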

Example 7: scrapy-plugins/scrapy-pagestorage (tests/test_pagestorage.py)
def test_writer_closed_on_spider_closed_signal(self):
        # the extension should have hooked spider_closed when it was created
        self.crawler_mock.signals.connect.assert_called_once_with(
            self.instance.spider_closed,
            signal=signals.spider_closed
        )
        # firing the handler must close the underlying writer exactly once
        with mock.patch.object(self.instance, '_writer') as writer_mock:
            self.instance.spider_closed(self.spider)
        writer_mock.close.assert_called_once_with()
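
The object under test follows Scrapy's standard extension pattern: connect a handler to a signal at construction time, then release resources when the signal fires. A hypothetical skeleton of such an extension:

from scrapy import signals

class PageWriterExtension:
    def __init__(self, crawler):
        self._writer = open('pages.jl', 'a')  # illustrative resource
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_closed(self, spider):
        # what the test asserts: the writer is closed exactly once
        self._writer.close()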

Example 8: inspirehep/hepcrawl (tests/unit/test_t2k.py)
def record():
    """Return results from the T2K spider."""
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    # carry the metadata parsed from the listing page over to the
    # detail-page response, as Request.meta would in a live crawl
    splash_response = fake_response_from_file('t2k/001.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["title"] = parsed_node.meta["title"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]
    splash_response.meta["authors"] = parsed_node.meta["authors"]

    # scrape_for_pdf is a generator; next() replaces the Python 2-only .next()
    parsed_item = next(spider.scrape_for_pdf(splash_response))
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
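
In a live crawl the same hand-off happens through Request.meta: anything placed there reappears on response.meta in the next callback, which is what the test simulates by copying fields onto the fake Splash response. A sketch with illustrative URLs and values (both methods would live on the spider):

import scrapy

def parse_node(self, response):
    yield scrapy.Request(
        'http://example.com/detail/001',  # hypothetical detail page
        callback=self.scrape_for_pdf,
        meta={'title': 'Some talk', 'date': '2016-01-01'},
    )

def scrape_for_pdf(self, response):
    # the values set on the request above are available again here
    yield {'title': response.meta['title'], 'date': response.meta['date']}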

Example 9: scrapy/scrapy (tests/test_selector.py)
def test_badly_encoded_body(self):
        # \xe9 on its own is not a valid UTF-8 byte sequence
        r1 = TextResponse('http://www.example.com',
                          body=b'<p>an Jos\xe9 de</p>',
                          encoding='utf-8')
        # must not raise despite the undecodable byte
        Selector(r1).xpath('//text()').getall()
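
The point of the test is that Selector must tolerate undecodable bytes: Scrapy decodes response bodies leniently, substituting a replacement character instead of raising UnicodeDecodeError. The same behavior is visible on the response itself:

from scrapy.http import TextResponse

r = TextResponse('http://www.example.com',
                 body=b'<p>an Jos\xe9 de</p>', encoding='utf-8')
print(r.text)  # the invalid byte is replaced, no exception is raised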

Example 10: inspirehep/hepcrawl (tests/unit/test_base.py)
def urls():
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    # register the spider's XML namespaces so the xpath below can match
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    return spider.get_urls_in_record(nodes[0])
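
_register_namespaces presumably wraps parsel's namespace registration: namespaced XML elements match nothing in plain XPath until their prefixes are registered. A standalone illustration with a made-up feed:

from scrapy.selector import Selector

xml = ('<feed xmlns:atom="http://www.w3.org/2005/Atom">'
       '<atom:link href="http://example.com/"/></feed>')
sel = Selector(text=xml, type='xml')
sel.register_namespace('atom', 'http://www.w3.org/2005/Atom')
print(sel.xpath('//atom:link/@href').get())  # http://example.com/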