How to use the scrapy.Spider class in Scrapy

To help you get started, we’ve selected a few scrapy.Spider examples, based on popular ways it is used in public projects.

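Before the project snippets below, here is a minimal, self-contained sketch of a scrapy.Spider subclass. The spider name, start URL, and CSS selectors are illustrative assumptions based on the quotes.toscrape.com demo site referenced in some of the examples, not code taken from any of the projects shown.

# Illustrative sketch (not from the projects below): scrape quotes from the toscrape.com demo site.
import scrapy


class QuotesSpider(scrapy.Spider):
    # name identifies the spider to the Scrapy CLI, e.g. scrapy crawl quotes
    name = "quotes"
    # the default start_requests() turns each start URL into a Request
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # parse() is the default callback; yield scraped items and follow-up requests
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

A file containing a spider like this can be run without a full Scrapy project via scrapy runspider (as the mu-editor example below also shows), or with scrapy crawl quotes inside a project.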

github scrapedia / scrapy-cookies / tests / test_storages / test_storage_mongo.py
def setUp(self):
        self.spider = Spider("foo")
        self.settings = Settings()
        self.settings.setmodule(default_settings)
        self.settings.setdict(self.local_settings)
        self.storage = MongoStorage(self.settings)
        self.storage.open_spider(self.spider)
github furas / python-examples / scrapy / __template__ / python - scrapy.py
#!/usr/bin/env python3

# date: 2019.12.07

import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
#from scrapy.commands.view import open_in_browser
#import json

class MySpider(scrapy.Spider):

    name = 'myspider'

    #allowed_domains = []

    # see page created for scraping: http://toscrape.com/
    start_urls = ['http://books.toscrape.com/'] #'http://quotes.toscrape.com']

    #def __init__(self, urls, *args, **kwargs):
    #    '''generate start_urls list'''
    #    super().__init__(*args, **kwargs)
    #    self.start_urls = urls.split(';')

    #def start_requests(self):
    #    '''generate requests instead of using start_urls'''
    #    self.url_template = 'http://quotes.toscrape.com/tag/{}/page/{}/'
github alltheplaces / alltheplaces / locations / spiders / michaelkors.py
import scrapy
import re
from locations.items import GeojsonPointItem

class MichaelkorsSpider(scrapy.Spider):

    name = "michaelkors"
    allowed_domains = ["locations.michaelkors.com"]
    download_delay = 0.5
    start_urls = (
        'https://locations.michaelkors.com/index.html',
    )

    def parse_stores(self, response):
        ref = re.findall(r"[^(\/)]+$", response.url)
        if ref:
            ref = ref[0].split('.')[0]
        properties = {
            'addr_full': response.xpath('//meta[@itemprop="streetAddress"]/@content').extract_first(),
            'phone': response.xpath(
                'normalize-space(//span[@itemprop="telephone"]/text())').extract_first(),
github mu-editor / mu / utils / flask_api.py
"""
The scrapy and beautifulsoup4 packages must be installed.

Usage:

scrapy runspider flask_api.py -o flask.json

"""
import scrapy
from bs4 import BeautifulSoup


URL = "http://flask.pocoo.org/docs/1.0/api/"


class BottleSpider(scrapy.Spider):
    name = "BottleSpider"
    start_urls = [URL]

    def parse(self, response):
        """
        Scrapes the list of modules associated with Flask. Causes
        scrapy to follow the links to the module docs and uses a different
        parser to extract the API information contained therein.
        """
        # Find all the function definitions on the page:
        for func in response.css("dl.function"):
            # Class details are always first items in dl.
            func_spec = func.css("dt")[0]
            func_doc = func.css("dd")[0]
            # Function name is always first dt
            func_name = BeautifulSoup(
github Smartproxy / Smartproxy / scrapy / smartproxy_spider.py
import scrapy
from w3lib.http import basic_auth_header

class SmartproxySpider(scrapy.Spider):
    name = "smartproxy"

    def start_requests(self):
        urls = [
            'https://www.whatismyip.com/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse,
                meta={'proxy': 'https://gate.smartproxy.com:7000'}, ## Your desired Endpoint
                headers={
                        'Proxy-Authorization': basic_auth_header(
                        'username', 'password') ## Your username and password for the proxy user
                }
            )

    def parse(self, response):
github YuLin12345 / Sneaker-Notify / main / main.py
def parse(self, response):
        products = Selector(response).xpath('//div[@class="category-products"]//li[@class="item last"]')

        for product in products:
            item = BaitItem()
            item['name'] = product.xpath('a/@title').extract()[0]
            item['link'] = product.xpath('a/@href').extract()[0]
            # item['image'] = product.xpath('a/img/@src').extract()[0]
            item['size'] = '**NOT SUPPORTED YET**'
            yield item

        yield Request(BaitURL, callback=self.parse, dont_filter=True, priority=83)
		
		
class BlendsSpider(Spider):
    
    name = "BlendsSpider"
    allowed_domains = ["blendsus.com"]
    start_urls = [BlendsURL]
    
    def __init__(self):
        logging.critical("BlendsSpider STARTED.")

    def parse(self, response):
        products = Selector(response).xpath('//div[@class="row"]//div[contains(@class,"product-index desktop-3 mobile-3")]')

        for product in products:
            item = BlendsItem()
            item['name'] = product.xpath('.//a/@title').extract()[0]
            item['link'] = "https://www.blendsus.com" + product.xpath('.//a/@href').extract()[0]
            # item['image'] = "https:" + product.xpath('.//a/img/@src').extract()[0]
github alltheplaces / alltheplaces / locations / spiders / thebarrecode.py
# -*- coding: utf-8 -*-
import scrapy
import json
from locations.items import GeojsonPointItem

class TheBarreCodeSpider(scrapy.Spider):
    name = "thebarrecode"
    allowed_domains = ["thebarrecode.com"]
    start_urls = (
        'http://www.thebarrecode.com/',
    )

    def parse(self, response):
        for location_url in response.xpath('//h4[@class="studio-location-name"]/a[1]/@href').extract():
            yield scrapy.Request(
                location_url,
                callback=self.parse_location,
            )

    def parse_location(self, response):
        properties = {
            'addr_full': response.xpath('//h4[@class="studio-address"]/span[@class="street"]/text()').extract_first(),
github lettleprince / BuddhaSpider / BuddhaSpider / spiders / buddha_spider.py
import scrapy
import logging
import random
from buddha_item import BuddhaItem
from utils.data_store import DataStore
import math
import time


date_string = time.strftime("%Y_%m_%d", time.localtime())
logging.basicConfig(
    filename=('buddha_%s.log' % (date_string)),
    level=logging.DEBUG, filemode='w')
logger = logging.getLogger(__name__)


class BuddhaSpider(scrapy.Spider):
    # BuddhaSpider

    name = "buddha"
    start_urls = [
        'http://91porn.com/v.php?next=watch',  # all videos
        'http://91porn.com/v.php?category=rf'  # recently featured
        ]
    # start_urls = ['https://www.zhihu.com/signin']
    # start_urls = ['https://twitter.com/']
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Content-Type": " application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
github dangsh / hive / scrapySpider / weibo / weibo / spiders / w2.py
# -*- coding: utf-8 -*-
import scrapy
import json
from weibo.items import WeiboItem
import time

class WiwiSpider(scrapy.Spider):
    count = 1
    name = 'ozzy'
    # start_urls = ['http://www.ip181.com/']

    cookie = {
        'T_WM' : '6f33fb05e2825da195d0d8a6bb2c1c21',
        'SCF' : 'AvOqQOQouYuqAqpXBdvflU6utIfkJOWAVWbxzfMTv9j0Rg4G-wmUILsLjnzZCdb6pH5frhsKWxrut4hANcM-2FM.',
        'SUB' : '_2A253etQMDeRhGeBM61EY8irEyzSIHXVUhPxErDV6PUJbkdAKLRjwkW1NRQw8HJf_5QkMzzCj-zPl-Xu6TGlqc5kJ',
        'WEIBOCN_WM' : '90112_90001',
        'WEIBOCN_FROM' : '1110006030',
        'M_WEIBOCN_PARAMS' : 'featurecode%3DH5tuiguang0623%26oid%3D4205574042439353%26luicode%3D10000011%26lfid%3D102803%26uicode%3D20000061%26fid%3D4205574042439353'
    }

    def start_requests(self):
        for i in range(1000):
            url = 'https://m.weibo.cn/api/container/getIndex?containerid=102803&client=h5&featurecode=H5tuiguang0623&need_head_cards=1&wm=90112_90001&since_id='+ str(i+1)
github alltheplaces / alltheplaces / locations / spiders / staples.py
import datetime
import scrapy
from locations.items import GeojsonPointItem
from locations.hours import OpeningHours


DAY_MAPPING = {
    'Mon': 'Mo',
    'Tue': 'Tu',
    'Wed': 'We',
    'Thu': 'Th',
    'Fri': 'Fr',
    'Sat': 'Sa',
    'Sun': 'Su'
}

class StaplesSpider(scrapy.Spider):

    name = "staples"
    allowed_domains = ["stores.staples.com"]
    start_urls = (
        'https://stores.staples.com/',
    )

    def parse_hours(self, elements):
        opening_hours = OpeningHours()

        for elem in elements:
            day = elem.xpath('.//td[@class="c-hours-details-row-day"]/text()').extract_first()
            intervals = elem.xpath('.//td[@class="c-hours-details-row-intervals"]')

            if intervals.xpath('./text()').extract_first() == "Closed":
                continue