""" This function parses a property page.
@url http://web:9312/properties/property_000000.html
@returns items 1
@scrapes title price description address image_urls
@scrapes url project spider server date
"""
# Create the loader using the response
l = ItemLoader(item=PropertiesItem(), response=response)
# Load fields using XPath expressions
l.add_value('title', response.meta['title'],
MapCompose(unicode.strip, unicode.title))
l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
MapCompose(lambda i: i.replace(',', ''), float),
re='[,.0-9]+')
l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
MapCompose(unicode.strip), Join())
l.add_xpath('address',
'//*[@itemtype="http://schema.org/Place"][1]/text()',
MapCompose(unicode.strip))
l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
MapCompose(lambda i: urlparse.urljoin(response.url, i)))
# Housekeeping fields
l.add_value('url', response.url)
l.add_value('project', self.settings.get('BOT_NAME'))
l.add_value('spider', self.name)
l.add_value('server', socket.gethostname())
l.add_value('date', datetime.datetime.now())
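# The @url/@returns/@scrapes lines in the docstring are Scrapy contracts
# and can be run with the built-in `scrapy check` command. Below is a
# minimal sketch of the PropertiesItem declaration the loader assumes;
# the field list is inferred from the add_value/add_xpath calls above,
# not taken from the original project:
import scrapy

class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    image_urls = scrapy.Field()
    # Housekeeping fields
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()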
# -- Loader processors for financial-statement fields: a fragment of an
# ItemLoader subclass body. MatchEndDate, ImdSumMembersOr and the imd_*
# helpers are project-specific callables defined elsewhere. --
op_income_in = MapCompose(MatchEndDate(float))
op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)
eps_basic_in = MapCompose(MatchEndDate(float))
eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)
eps_diluted_in = MapCompose(MatchEndDate(float))
eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)
dividend_in = MapCompose(MatchEndDate(float))
dividend_out = Compose(imd_get_per_share_value, lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)
assets_in = MapCompose(MatchEndDate(float))
assets_out = Compose(imd_filter_member, imd_mult, imd_max)
cur_assets_in = MapCompose(MatchEndDate(float))
cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)
cur_liab_in = MapCompose(MatchEndDate(float))
cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max)
equity_in = MapCompose(MatchEndDate(float))
equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity)
cash_in = MapCompose(MatchEndDate(float))
cash_out = Compose(imd_filter_member, imd_mult, imd_max)
cash_flow_op_in = MapCompose(MatchEndDate(float, True))
cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)
cash_flow_inv_in = MapCompose(MatchEndDate(float, True))
cash_flow_inv_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)
net_income_in = MapCompose(MatchEndDate(float))
net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income)
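# The *_in/*_out attributes in this listing only take effect as class
# attributes of an ItemLoader subclass: Scrapy applies a field's _in
# processor to every extracted value and its _out processor to the
# collected list. A minimal sketch of the missing wrapper (the class
# name is an assumption):
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class ReportItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # ... the *_in/*_out declarations shown above go here ...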
# -- A second variant of the same callback: the title is extracted
# directly with XPath instead of being passed in via response.meta --
def parse(self, response):
    """ This function parses a property page.

    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)
    # Load fields using XPath expressions
    l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                MapCompose(unicode.strip, unicode.title))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(unicode.strip), Join())
    l.add_xpath('address',
                '//*[@itemtype="http://schema.org/Place"][1]/text()',
                MapCompose(unicode.strip))
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))
    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
# -- Loader processors from a jail-roster (inmate booking) scraper;
# parse_sex, parse_race, parse_timestamp, parse_severity and
# format_timestamp are project-specific helpers defined elsewhere --
timestamp_out = MapCompose(format_timestamp)
inmate_lastname_in = MapCompose(str.strip)
inmate_firstname_in = MapCompose(str.strip)
inmate_middlename_in = MapCompose(str.strip)
inmate_sex_in = MapCompose(str.strip, parse_sex)
inmate_race_in = MapCompose(str.strip, parse_race)
inmate_age_in = MapCompose(str.strip, int)
inmate_dob_in = MapCompose(str.strip)
inmate_address_in = MapCompose(str.strip)
booking_timestamp_in = MapCompose(str.strip, parse_timestamp)
booking_timestamp_out = MapCompose(format_timestamp)
release_timestamp_in = MapCompose(str.strip)
processing_numbers_in = MapCompose(str.strip)
agency_in = MapCompose(str.strip)
facility_in = MapCompose(str.strip)
charges_out = Join(' | ')
severity_in = MapCompose(str.strip, parse_severity)
severity_out = Join(' | ')
bond_amount_in = MapCompose(str.strip)
current_status_in = MapCompose(str.strip)
court_dates_in = MapCompose(str.strip)
days_jailed_in = MapCompose(str.strip)
other_in = MapCompose(str.strip)
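# Join(' | ') above is an output processor: it collapses the list of
# values collected for charges/severity into one pipe-delimited string.
# A standalone illustration:
from scrapy.loader.processors import Join

print(Join(' | ')(['THEFT', 'DUI']))  # -> 'THEFT | DUI'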
# -- Additional processors from the same financial-statement loader --
comprehensive_income_net_of_tax_in = MapCompose(MatchEndDate(float))
comprehensive_income_net_of_tax_out = Compose(imd_filter_member, imd_mult, imd_max)
research_and_dev_expense_in = MapCompose(MatchEndDate(float))
research_and_dev_expense_out = Compose(imd_filter_member, imd_mult, imd_max)
warranty_accrual_in = MapCompose(MatchEndDate(float))
warranty_accrual_out = Compose(imd_filter_member, imd_mult, imd_max)
warranty_accrual_payments_in = MapCompose(MatchEndDate(float))
warranty_accrual_payments_out = Compose(imd_filter_member, imd_mult, imd_max)
# -- A custom ItemLoader for lagou.com job postings --
class LagouJobItemLoader(ItemLoader):
    # Return a single value per field instead of a list
    default_output_processor = TakeFirst()
class LagouJobItem(scrapy.Item):
    # Job posting scraped from lagou.com
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    title = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    # Join is an output processor; wrapping it in MapCompose as an input
    # processor would join the characters of each individual tag string
    tags = scrapy.Field(
        output_processor=Join(","),
    )
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags),
    )
    company_url = scrapy.Field()
    company_name = scrapy.Field()

# Stray field declarations from a different (Brazilian public-safety)
# item class: education level, skin colour, offence natures involved
instrucao = scrapy.Field()
cutis = scrapy.Field()
naturezas_envolvidas = scrapy.Field()
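# A hypothetical use of the loader/item pair above inside a spider
# callback; the selector and field choices are illustrative only, not
# taken from the original project:
def parse_job(self, response):
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css('title', '.job-name::text')
    item_loader.add_value('url', response.url)
    return item_loader.load_item()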
def map_month(s):
    """Map a Portuguese month name or abbreviation to its number."""
    month = s.lower()
    month = month[:3]
    _m = dict(jan=1, fev=2, mar=3, abr=4, mai=5, jun=6,
              jul=7, ago=8, set=9, out=10, nov=11, dez=12)
    return _m[month]
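# map_month accepts a full Portuguese month name or its three-letter
# abbreviation, in any case:
assert map_month('Fevereiro') == 2
assert map_month('SET') == 9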
class SsptransparenciaBOLoader(scrapy.loader.ItemLoader):
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
    nav_ano_in = MapCompose(unicode.strip, int)
    nav_mes_in = MapCompose(unicode.strip, map_month)
    bo_numero_naturezas_in = MapCompose(unicode.strip, int)
    bo_numero_vitimas_in = MapCompose(unicode.strip, int)


class SsptransparenciaNaturezaLoader(scrapy.loader.ItemLoader):
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
    count_in = MapCompose(int)


class SsptransparenciaVitimaLoader(scrapy.loader.ItemLoader):
    default_output_processor = TakeFirst()
""" This function parses a property page.
@url http://scrapybook.s3.amazonaws.com:9312/properties/property_000000.html
@returns items 1
@scrapes title price description address image_urls
@scrapes url project spider server date
"""
# Create the loader using the response
l = ItemLoader(item=PropertiesItem(), response=response)
# Load fields using XPath expressions
l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
MapCompose(str.strip, str.title))
l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
MapCompose(lambda i: i.replace(',', ''), float),
re='[,.0-9]+')
l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
MapCompose(str.strip), Join())
l.add_xpath('address',
'//*[@itemtype="http://schema.org/Place"][1]/text()',
MapCompose(str.strip))
l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
MapCompose(lambda i: response.urljoin(i)))
# Housekeeping fields
l.add_value('url', response.url)
l.add_value('project', self.settings.get('BOT_NAME'))
l.add_value('spider', self.name)
l.add_value('server', socket.gethostname())
l.add_value('date', datetime.datetime.now())
# -- Final variant: every XPath is relative (.//*) so the same code can
# run against a selector for one listing among many, and the item URL is
# read from the page's itemprop="url" link rather than response.url. The
# method name below is an assumption; the snippet omits its header. --
def parse_item(self, response):
    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)
    # Load fields using XPath expressions
    l.add_xpath('title', './/*[@itemprop="name"][1]/text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description',
                './/*[@itemprop="description"][1]/text()',
                MapCompose(str.strip), Join())
    l.add_xpath('address',
                './/*[@itemtype="http://schema.org/Place"][1]/*/text()',
                MapCompose(str.strip))
    make_url = lambda i: response.urljoin(i)
    l.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src',
                MapCompose(make_url))
    # Housekeeping fields
    l.add_xpath('url', './/*[@itemprop="url"][1]/@href',
                MapCompose(make_url))
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
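# The contract docstrings in the parse variants above can be exercised
# without a full crawl: `scrapy check` runs the @url/@returns/@scrapes
# contracts, and `scrapy parse` runs a single callback against one URL
# (the spider name 'properties' here is an assumption):
#   $ scrapy check properties
#   $ scrapy parse --spider=properties http://web:9312/properties/property_000000.html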