How to use the scrapy.item.Field class in Scrapy

To help you get started, we’ve selected a few Scrapy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github geekan / scrapy-examples / zhihu / zhihu / items.py View on Github external
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class ZhihuPeopleItem(Item):
    """Scrapy item declaring the fields scraped for one Zhihu "people" record.

    Every attribute is declared with a bare ``Field()``, i.e. no per-field
    metadata (serializers, processors, ...) is attached here.  What each
    field actually holds is decided by the spider that populates the item,
    which is not visible in this file.
    """
    # Identity / profile fields.
    # NOTE: `id` shadows the builtin only inside this class body; it is a
    # field name that callers use as a key (item['id']), so it must stay.
    id = Field()
    name = Field()
    sign = Field()
    location = Field()
    business = Field()
    employment = Field()
    position = Field()
    education = Field()
    education_extra = Field()
    description = Field()
    # Presumably per-user counters shown on the profile page (votes, thanks,
    # questions, answers, ...) -- TODO confirm against the spider.
    agree = Field()
    thanks = Field()
    asks = Field()
    answers = Field()
    posts = Field()
    collections = Field()
    logs = Field()
    # Presumably social-graph counters or lists -- TODO confirm.
    followees = Field()
    followers = Field()
    follow_topics = Field()
github Blender3D / Proxy-Scraper / proxies / items.py View on Github external
from scrapy.item import Item, Field

from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.exporter import BaseItemExporter
from scrapy.contrib.loader.processor import MapCompose, TakeFirst

class Proxy(Item):
    """Scrapy item with two metadata-free fields: ``address`` and ``port``.

    Both fields are read together by ``IPPortItemExporter.export_item`` in
    this file, which formats them as ``address:port``.
    """
    address = Field()
    port = Field()

class ProxyItemLoader(XPathItemLoader):
    """Item loader that wires input/output processors for proxy fields.

    The ``<field>_in`` attributes follow Scrapy's item-loader naming
    convention for per-field input processors (see the Scrapy Item Loaders
    documentation); the target item class is not set here.
    """
    # Output processor used for fields that do not declare their own.
    default_output_processor = TakeFirst()
    
    # Convert each extracted value to unicode, then strip it.
    # NOTE(review): `unicode` is a Python 2 builtin; this line raises
    # NameError on Python 3 -- confirm the supported interpreter.
    address_in = MapCompose(unicode, unicode.strip)
    # Convert each extracted value with int().
    port_in = MapCompose(int)

class IPPortItemExporter(BaseItemExporter):
    """Exporter that writes each item as one ``address:port`` line.

    The output goes to the file-like object handed to the constructor.
    """

    def __init__(self, file, **kwargs):
        """Configure the base exporter and remember the output file.

        Args:
            file: object exposing ``write()``; every exported line is
                written to it.
            **kwargs: exporter options forwarded to ``_configure``.  They
                are passed with ``dont_fail=True``.
        """
        self._configure(kwargs, dont_fail=True)
        self.file = file

    def export_item(self, item):
        """Write ``item`` as ``<address>:<port>`` followed by a newline.

        Args:
            item: mapping-like object providing ``address`` and ``port``
                keys.

        Returns:
            Whatever ``self.file.write`` returns for the written line.
        """
        line = '{item[address]}:{item[port]}\n'.format(item=item)
        return self.file.write(line)
github geekan / scrapy-examples / sis / sis / items.py View on Github external
bottomline = Field()
    duty = Field()
    xxx = Field()

class SisForumListItem(Item):
    """Scrapy item declaring the fields of one forum-list entry.

    All fields are bare ``Field()`` declarations without metadata; their
    concrete value types are set by the populating spider, which is not
    visible here.
    """
    content = Field() # raw content with all html
    # Presumably thread metadata taken from the list row -- TODO confirm.
    title = Field()
    thread_type = Field()
    author = Field()
    post_time = Field()
    link = Field()
    # Presumably per-thread statistics (rating, replies, views) -- TODO confirm.
    star = Field()
    comment = Field()
    view = Field()
    # Presumably attributes of the attached media -- TODO confirm.
    size = Field()
    video_type = Field()
    last_post_time = Field()
github carpedm20 / voxoffice / scrapy / tutorial / spiders / spider.py View on Github external
__author__ = 'carpedm20'
__date__ = '2014.07.25'

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

# http://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cnt&date=20050207&tg=0

from scrapy.item import Item, Field

class Movie(Item):
    """Scrapy item with four metadata-free fields for one movie entry.

    Presumably one row of the Naver movie ranking page referenced in the
    URL comment above -- TODO confirm against the spider's parse method.
    """
    name = Field()
    url = Field()
    rank = Field()
    date = Field()

#tgs = range(20)
#tgs.remove(9)

def make_urls(tg):
    url = "http://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cnt&date=%s&tg=%s"
    urls = []

    from datetime import date, timedelta

    current_date = date(2005, 2, 7)
    end_date = date.today()
    delta = timedelta(days=1)

    while current_date <= end_date:
        urls.append(url % (current_date.strftime("%Y%m%d"), tg))
github giorgosera / growth-hacking-toolkit / crawlers / appannie / appannie / items.py View on Github external
from scrapy.item import Item, Field

class AndroidAppItem(Item):
    """Scrapy item declaring the fields collected for one Android app.

    Every field is a bare ``Field()`` with no metadata attached; value
    types and formats are determined by the populating spider, which is
    not visible here.
    """
    # Presumably app listing details -- TODO confirm.
    name = Field()
    category = Field()
    # Presumably publisher / contact details -- TODO confirm.
    company = Field()
    email = Field()
    developer_website = Field()
    # Presumably the bounds of the store's download-count bracket -- TODO confirm.
    min_downloads = Field()
    max_downloads = Field()
    store_url = Field()
    is_free = Field()
github kprestel / py-investment / pytech / crawler / items.py View on Github external
fiscal_year = Field()
    end_date = Field()

    revenues = Field()
    investment_revenues = Field()
    op_income = Field()
    net_income = Field()
    gross_profit = Field()
    interest_expense = Field()
    research_and_dev_expense = Field()

    eps_basic = Field()
    eps_diluted = Field()

    dividend = Field()

    # Taxes
    tax_expense = Field()
    net_taxes_paid = Field()

    # Balance sheet stuffs
    assets = Field()
    cur_assets = Field()
    acts_pay_current = Field()
    acts_receive_current = Field()
    acts_receive_noncurrent = Field()
    total_liabilities = Field()
    total_liabilities_equity = Field()
    shares_outstanding = Field()
    shares_outstanding_diluted = Field()
    common_stock_outstanding = Field()
github kprestel / py-investment / pytech / crawler / items.py View on Github external
# Cash flow from operating, investing, and financing
    cash_flow_op = Field()
    cash_flow_inv = Field()
    cash_flow_fin = Field()


class PriceItem(Item):
    """Scrapy item declaring the fields of one price record for a symbol.

    All fields are bare ``Field()`` declarations without metadata.
    """
    # Trading symbol
    symbol = Field()

    # YYYY-MM-DD
    date = Field()

    # Price/volume fields; presumably daily OHLC values -- TODO confirm.
    # NOTE: `open` shadows the builtin only inside this class body; it is a
    # field name used as an item key, so it must keep this spelling.
    open = Field()
    close = Field()
    high = Field()
    low = Field()
    adj_close = Field()
    volume = Field()


class SymbolItem(Item):
    """Scrapy item with two metadata-free fields: ``symbol`` and ``name``.

    Presumably pairs a trading symbol with a human-readable name -- TODO
    confirm against the populating spider.
    """
    symbol = Field()
    name = Field()
github openstack / openstack-doc-tools / sitemap / generator / spiders / sitemap_file.py View on Github external
import re
import time
try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

from scrapy import item
from scrapy import linkextractors
from scrapy import spiders


class SitemapItem(item.Item):
    '''Class to represent an item in the sitemap.

    Declares four metadata-free fields.  The names match the child
    elements of a ``<url>`` entry in the sitemaps.org protocol; how they
    are serialized is decided by code not visible here.
    '''
    loc = item.Field()
    lastmod = item.Field()
    priority = item.Field()
    changefreq = item.Field()


class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    MAINT_SERIES = [
        'newton',
        'ocata',
        'pike',
        'queens',
        'rocky',
        'stein',
    ]