# Hedged imports for this test-fixture fragment (assumed; the original
# snippet does not show its header). MongoStorage and self.local_settings
# are project-specific.
from scrapy import Spider
from scrapy.settings import Settings, default_settings

def setUp(self):
    self.spider = Spider("foo")
    self.settings = Settings()
    self.settings.setmodule(default_settings)
    self.settings.setdict(self.local_settings)
    self.storage = MongoStorage(self.settings)
    self.storage.open_spider(self.spider)
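
# A minimal matching tearDown sketch (assumed, not in the original):
# Scrapy cache-storage backends pair open_spider() with close_spider().
def tearDown(self):
    self.storage.close_spider(self.spider)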

#!/usr/bin/env python3
# date: 2019.12.07
import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
#from scrapy.commands.view import open_in_browser
#import json

class MySpider(scrapy.Spider):
    name = 'myspider'
    #allowed_domains = []

    # see page created for scraping: http://toscrape.com/
    start_urls = ['http://books.toscrape.com/'] #'http://quotes.toscrape.com']

    #def __init__(self, urls, *args, **kwargs):
    #    '''generate start_urls list'''
    #    super().__init__(*args, **kwargs)
    #    self.start_urls = urls.split(';')

    #def start_requests(self):
    #    '''generate requests instead of using start_urls'''
    #    self.url_template = 'http://quotes.toscrape.com/tag/{}/page/{}/'
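
    # A minimal parse sketch (assumed; the original snippet is truncated
    # before any callback). The selectors follow the public
    # books.toscrape.com markup:
    def parse(self, response):
        for book in response.css('article.product_pod'):
            yield {'title': book.css('h3 a::attr(title)').get()}
        # follow pagination when a "next" link is present
        yield from response.follow_all(css='li.next a', callback=self.parse)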

import scrapy
import re

from locations.items import GeojsonPointItem

class MichaelkorsSpider(scrapy.Spider):
    name = "michaelkors"
    allowed_domains = ["locations.michaelkors.com"]
    download_delay = 0.5
    start_urls = (
        'https://locations.michaelkors.com/index.html',
    )

    def parse_stores(self, response):
        # Take the last path segment of the URL (minus extension) as the ref.
        ref = re.findall(r"[^(\/)]+$", response.url)
        if ref:
            ref = ref[0].split('.')[0]
        properties = {
            'addr_full': response.xpath('//meta[@itemprop="streetAddress"]/@content').extract_first(),
            'phone': response.xpath(
                'normalize-space(//span[@itemprop="telephone"]/text())').extract_first(),
            # ... snippet truncated here; remaining fields omitted ...
        }
        # Assumed completion: alltheplaces spiders conventionally yield the
        # assembled properties as a GeojsonPointItem.
        yield GeojsonPointItem(**properties)
"""
The scrapy and beautifulsoup4 packages must be installed.
Usage:
scrapy runspider flask_api.py -o flask.json
"""
import scrapy
from bs4 import BeautifulSoup
URL = "http://flask.pocoo.org/docs/1.0/api/"
class BottleSpider(scrapy.Spider):
name = "BottleSpider"
start_urls = [URL]
def parse(self, response):
"""
Scrapes the list of modules associated with Bottle. Causes
scrapy to follow the links to the module docs and uses a different
parser to extract the API information contained therein.
"""
# Find all the function definitions on the page:
for func in response.css("dl.function"):
# Class details are always first items in dl.
func_spec = func.css("dt")[0]
func_doc = func.css("dd")[0]
# Function name is always first dt
func_name = BeautifulSoup(
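            # Assumed continuation: yield the extracted pair so the
            # "runspider -o flask.json" invocation above has records to write.
            yield {
                'name': func_name,
                'doc': BeautifulSoup(func_doc.get(), "html.parser").get_text().strip(),
            }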

import scrapy
from w3lib.http import basic_auth_header

class SmartproxySpider(scrapy.Spider):
    name = "smartproxy"

    def start_requests(self):
        urls = [
            'https://www.whatismyip.com/'
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={'proxy': 'https://gate.smartproxy.com:7000'},  # your desired endpoint
                headers={
                    'Proxy-Authorization': basic_auth_header(
                        'username', 'password')  # your username and password for the proxy user
                }
            )

    def parse(self, response):
        # Assumed body (snippet truncated): log part of the page to confirm
        # the exit IP reported by whatismyip.com.
        self.logger.info(response.text[:200])

# Detached fragment: the parse() method of a Bait spider (its class header is
# not shown). The imports below are assumed; BaitItem, BlendsItem, BaitURL and
# BlendsURL are project-specific.
import logging

from scrapy import Spider, Request
from scrapy.selector import Selector

def parse(self, response):
    products = Selector(response).xpath('//div[@class="category-products"]//li[@class="item last"]')
    for product in products:
        item = BaitItem()
        item['name'] = product.xpath('a/@title').extract()[0]
        item['link'] = product.xpath('a/@href').extract()[0]
        # item['image'] = product.xpath('a/img/@src').extract()[0]
        item['size'] = '**NOT SUPPORTED YET**'
        yield item
    # Re-crawl the listing page so new stock is picked up.
    yield Request(BaitURL, callback=self.parse, dont_filter=True, priority=83)

class BlendsSpider(Spider):
    name = "BlendsSpider"
    allowed_domains = ["blendsus.com"]
    start_urls = [BlendsURL]

    def __init__(self):
        super().__init__()  # assumed; Spider.__init__ should still run
        logging.critical("BlendsSpider STARTED.")

    def parse(self, response):
        products = Selector(response).xpath('//div[@class="row"]//div[contains(@class,"product-index desktop-3 mobile-3")]')
        for product in products:
            item = BlendsItem()
            item['name'] = product.xpath('.//a/@title').extract()[0]
            item['link'] = "https://www.blendsus.com" + product.xpath('.//a/@href').extract()[0]
            # item['image'] = "https:" + product.xpath('.//a/img/@src').extract()[0]
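            # Assumed continuation (snippet truncated): yield the item so it
            # reaches the item pipeline.
            yield item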

# -*- coding: utf-8 -*-
import scrapy
import json

from locations.items import GeojsonPointItem

class TheBarreCodeSpider(scrapy.Spider):
    name = "thebarrecode"
    allowed_domains = ["thebarrecode.com"]
    start_urls = (
        'http://www.thebarrecode.com/',
    )

    def parse(self, response):
        for location_url in response.xpath('//h4[@class="studio-location-name"]/a[1]/@href').extract():
            yield scrapy.Request(
                location_url,
                callback=self.parse_location,
            )

    def parse_location(self, response):
        properties = {
            'addr_full': response.xpath('//h4[@class="studio-address"]/span[@class="street"]/text()').extract_first(),
            # ... snippet truncated here; remaining fields omitted ...
        }
        yield GeojsonPointItem(**properties)  # assumed completion, matching the alltheplaces convention

import logging
import math
import random
import time

import scrapy  # assumed; the snippet subclasses scrapy.Spider but the import is not shown

from buddha_item import BuddhaItem
from utils.data_store import DataStore

date_string = time.strftime("%Y_%m_%d", time.localtime())
logging.basicConfig(
    filename=('buddha_%s.log' % (date_string)),
    level=logging.DEBUG, filemode='w')
logger = logging.getLogger(__name__)

class BuddhaSpider(scrapy.Spider):
    # BuddhaSpider
    name = "buddha"
    start_urls = [
        'http://91porn.com/v.php?next=watch',   # all videos
        'http://91porn.com/v.php?category=rf'   # recently featured
    ]
    # start_urls = ['https://www.zhihu.com/signin']
    # start_urls = ['https://twitter.com/']
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "  # assumed closing quote; snippet truncated mid-string
    }

# -*- coding: utf-8 -*-
import scrapy
import json
import time

from weibo.items import WeiboItem

class WiwiSpider(scrapy.Spider):
    count = 1
    name = 'ozzy'
    # start_urls = ['http://www.ip181.com/']
    cookie = {
        'T_WM': '6f33fb05e2825da195d0d8a6bb2c1c21',
        'SCF': 'AvOqQOQouYuqAqpXBdvflU6utIfkJOWAVWbxzfMTv9j0Rg4G-wmUILsLjnzZCdb6pH5frhsKWxrut4hANcM-2FM.',
        'SUB': '_2A253etQMDeRhGeBM61EY8irEyzSIHXVUhPxErDV6PUJbkdAKLRjwkW1NRQw8HJf_5QkMzzCj-zPl-Xu6TGlqc5kJ',
        'WEIBOCN_WM': '90112_90001',
        'WEIBOCN_FROM': '1110006030',
        'M_WEIBOCN_PARAMS': 'featurecode%3DH5tuiguang0623%26oid%3D4205574042439353%26luicode%3D10000011%26lfid%3D102803%26uicode%3D20000061%26fid%3D4205574042439353'
    }

    def start_requests(self):
        for i in range(1000):
            url = ('https://m.weibo.cn/api/container/getIndex?containerid=102803'
                   '&client=h5&featurecode=H5tuiguang0623&need_head_cards=1'
                   '&wm=90112_90001&since_id=' + str(i + 1))
            # Assumed continuation (snippet truncated): issue the request with
            # the cookie jar defined above.
            yield scrapy.Request(url, cookies=self.cookie, callback=self.parse)

import datetime

import scrapy  # assumed; the snippet subclasses scrapy.Spider but the import is not shown

from locations.items import GeojsonPointItem
from locations.hours import OpeningHours

DAY_MAPPING = {
    'Mon': 'Mo',
    'Tue': 'Tu',
    'Wed': 'We',
    'Thu': 'Th',
    'Fri': 'Fr',
    'Sat': 'Sa',
    'Sun': 'Su'
}

class StaplesSpider(scrapy.Spider):
    name = "staples"
    allowed_domains = ["stores.staples.com"]
    start_urls = (
        'https://stores.staples.com/',
    )

    def parse_hours(self, elements):
        opening_hours = OpeningHours()
        for elem in elements:
            day = elem.xpath('.//td[@class="c-hours-details-row-day"]/text()').extract_first()
            intervals = elem.xpath('.//td[@class="c-hours-details-row-intervals"]')
            # Skip days marked "Closed"; they contribute no interval.
            if intervals.xpath('./text()').extract_first() == "Closed":
                continue
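            # Assumed continuation (snippet truncated): register each open/close
            # pair on the OpeningHours helper. The interval format below is an
            # illustrative guess, not taken from the source page.
            for interval in intervals.xpath('./span/text()').extract():
                open_time, close_time = interval.split(' - ')
                opening_hours.add_range(day=DAY_MAPPING[day],
                                        open_time=open_time,
                                        close_time=close_time,
                                        time_format='%I:%M %p')
        return opening_hours.as_opening_hours()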