How to use the scrapy.loader.processors.MapCompose function in Scrapy (in modern Scrapy versions this processor lives in itemloaders.processors.MapCompose)

To help you get started, we’ve selected a few Scrapy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github scalingexcellence / scrapybook / ch05 / properties / properties / spiders / api.py View on Github external
""" This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_value('title', response.meta['title'],
                    MapCompose(unicode.strip, unicode.title))
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(unicode.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(unicode.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
github kprestel / py-investment / pytech / crawler / loaders.py View on Github external
op_income_in = MapCompose(MatchEndDate(float))
    op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)

    eps_basic_in = MapCompose(MatchEndDate(float))
    eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    eps_diluted_in = MapCompose(MatchEndDate(float))
    eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    dividend_in = MapCompose(MatchEndDate(float))
    dividend_out = Compose(imd_get_per_share_value, lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)

    assets_in = MapCompose(MatchEndDate(float))
    assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_assets_in = MapCompose(MatchEndDate(float))
    cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_liab_in = MapCompose(MatchEndDate(float))
    cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    equity_in = MapCompose(MatchEndDate(float))
    equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity)

    cash_in = MapCompose(MatchEndDate(float))
    cash_out = Compose(imd_filter_member, imd_mult, imd_max)

    cash_flow_op_in = MapCompose(MatchEndDate(float, True))
    cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    cash_flow_inv_in = MapCompose(MatchEndDate(float, True))
    cash_flow_inv_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)
github kprestel / py-investment / pytech / crawler / loaders.py View on Github external
net_income_in = MapCompose(MatchEndDate(float))
    net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income)

    op_income_in = MapCompose(MatchEndDate(float))
    op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)

    eps_basic_in = MapCompose(MatchEndDate(float))
    eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    eps_diluted_in = MapCompose(MatchEndDate(float))
    eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    dividend_in = MapCompose(MatchEndDate(float))
    dividend_out = Compose(imd_get_per_share_value, lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)

    assets_in = MapCompose(MatchEndDate(float))
    assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_assets_in = MapCompose(MatchEndDate(float))
    cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_liab_in = MapCompose(MatchEndDate(float))
    cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    equity_in = MapCompose(MatchEndDate(float))
    equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity)

    cash_in = MapCompose(MatchEndDate(float))
    cash_out = Compose(imd_filter_member, imd_mult, imd_max)

    cash_flow_op_in = MapCompose(MatchEndDate(float, True))
    cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)
github scalingexcellence / scrapybook / ch03 / properties / properties / spiders / basic.py View on Github external
def parse(self, response):
        """ This function parses a property page.

        @url http://web:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """
        # NOTE(review): Python 2 code — uses ``unicode`` methods and the
        # ``urlparse`` module; on Python 3 these become ``str`` and
        # ``urllib.parse``.  The docstring above is a Scrapy contract
        # (checked by ``scrapy check``), so it must not be reworded.

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        # Title: strip whitespace and normalize to Title Case per value.
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(unicode.strip, unicode.title))
        # Price: the ``re`` argument first extracts the numeric part
        # (digits, commas, dots); then commas are dropped and the string
        # is converted to float.
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        # Description: strip each text node, then Join() them into one string.
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(unicode.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(unicode.strip))
        # Image URLs: resolve each relative @src against the response URL.
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: urlparse.urljoin(response.url, i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
github lahoffm / aclu-bail-reform / src / webscraper / gwinnett / gwinnett / items.py View on Github external
timestamp_out = MapCompose(format_timestamp)
    inmate_lastname_in = MapCompose(str.strip)
    inmate_firstname_in = MapCompose(str.strip)
    inmate_middlename_in = MapCompose(str.strip)
    inmate_sex_in = MapCompose(str.strip, parse_sex)
    inmate_race_in = MapCompose(str.strip, parse_race)
    inmate_age_in = MapCompose(str.strip, int)
    inmate_dob_in = MapCompose(str.strip)
    inmate_address_in = MapCompose(str.strip)
    booking_timestamp_in = MapCompose(str.strip, parse_timestamp)
    booking_timestamp_out = MapCompose(format_timestamp)
    release_timestamp_in = MapCompose(str.strip)
    processing_numbers_in = MapCompose(str.strip)
    agency_in = MapCompose(str.strip)
    facility_in = MapCompose(str.strip)
    charges_out = Join(' | ')
    severity_in = MapCompose(str.strip, parse_severity)
    severity_out = Join(' | ')
    bond_amount_in = MapCompose(str.strip)
    current_status_in = MapCompose(str.strip)
    court_dates_in = MapCompose(str.strip)
    days_jailed_in = MapCompose(str.strip)
    other_in = MapCompose(str.strip)
github kprestel / py-investment / pytech / crawler / loaders.py View on Github external
comprehensive_income_net_of_tax_in = MapCompose(MatchEndDate(float))
    comprehensive_income_net_of_tax_out = Compose(imd_filter_member, imd_mult, imd_max)

    research_and_dev_expense_in = MapCompose(MatchEndDate(float))
    research_and_dev_expense_out = Compose(imd_filter_member, imd_mult, imd_max)

    warranty_accrual_in = MapCompose(MatchEndDate(float))
    warranty_accrual_out = Compose(imd_filter_member, imd_mult, imd_max)

    warranty_accrual_payments_in = MapCompose(MatchEndDate(float))
    warranty_accrual_payments_out = Compose(imd_filter_member, imd_mult, imd_max)

    net_income_in = MapCompose(MatchEndDate(float))
    net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income)

    op_income_in = MapCompose(MatchEndDate(float))
    op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)

    eps_basic_in = MapCompose(MatchEndDate(float))
    eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    eps_diluted_in = MapCompose(MatchEndDate(float))
    eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    dividend_in = MapCompose(MatchEndDate(float))
    dividend_out = Compose(imd_get_per_share_value, lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)

    assets_in = MapCompose(MatchEndDate(float))
    assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_assets_in = MapCompose(MatchEndDate(float))
    cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)
github hackfengJam / ArticleSpider / ArticleSpider / items.py View on Github external
class LagouJobItemLoader(ItemLoader):
    """Custom ItemLoader for Lagou job postings.

    Keeps only the first extracted value for every field by default.

    Bug fix: the attribute must be named ``default_output_processor`` —
    Scrapy's ItemLoader looks up exactly that name.  The original
    ``default_out_processor`` was a typo and was silently ignored, so
    every field came out as a list instead of a scalar.
    """
    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):
    # Lagou.com job posting item (comment translated from Chinese:
    # "Lagou job position information").
    url = scrapy.Field()
    url_object_id = scrapy.Field()  # presumably a hash/id derived from url — confirm against pipeline
    title = scrapy.Field()
    salary = scrapy.Field()
    # ``remove_splash`` is a project helper applied per extracted value;
    # from its name it likely strips a "/" separator — TODO confirm.
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    tags = scrapy.Field(
        # NOTE(review): MapCompose(Join(",")) applies Join to EACH extracted
        # value individually (for a string value that joins its characters
        # with commas).  The intent was probably
        # ``output_processor=Join(",")`` on the whole list — confirm.
        input_processor=MapCompose(Join(","))
    )
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        # remove_tags strips HTML markup from the address snippet.
        input_processor=MapCompose(remove_tags),
    )
    company_url = scrapy.Field()
    company_name = scrapy.Field()
github eltermann / ssp-transparencia / ssptransparencia / ssptransparencia / items.py View on Github external
instrucao = scrapy.Field()
    cutis = scrapy.Field()
    naturezas_envolvidas = scrapy.Field()


def map_month(s):
    """Map a Portuguese month name (or abbreviation) to its 1-based number.

    Only the first three characters are significant and matching is
    case-insensitive, so 'Janeiro', 'JAN' and 'jan' all map to 1.
    Raises KeyError for unrecognized input.
    """
    abbrevs = ['jan', 'fev', 'mar', 'abr', 'mai', 'jun',
               'jul', 'ago', 'set', 'out', 'nov', 'dez']
    lookup = {name: number for number, name in enumerate(abbrevs, start=1)}
    return lookup[s.lower()[:3]]


class SsptransparenciaBOLoader(scrapy.loader.ItemLoader):
    """Loader for B.O. (boletim de ocorrência / police report) items.

    Defaults: strip whitespace from every input value; keep only the
    first value on output.

    NOTE(review): uses Python 2 ``unicode`` methods; on Python 3 these
    would be ``str.strip`` etc.
    """
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)

    # Numeric fields: strip, then coerce.
    nav_ano_in = MapCompose(unicode.strip, int)              # year -> int
    nav_mes_in = MapCompose(unicode.strip, map_month)        # Portuguese month name -> 1..12
    bo_numero_naturezas_in = MapCompose(unicode.strip, int)  # count -> int
    bo_numero_vitimas_in = MapCompose(unicode.strip, int)    # count -> int


class SsptransparenciaNaturezaLoader(scrapy.loader.ItemLoader):
    """Loader for "natureza" (incident-nature) items.

    Same defaults as the B.O. loader: strip inputs, take the first value.
    NOTE(review): Python 2 ``unicode`` — would be ``str`` on Python 3.
    """
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)

    count_in = MapCompose(int)  # coerce count to int (no strip — assumed already clean)


class SsptransparenciaVitimaLoader(scrapy.loader.ItemLoader):
    default_output_processor = TakeFirst()
github scalingexcellence / scrapybook-2nd-edition / ch03 / properties / properties / spiders / easy.py View on Github external
""" This function parses a property page.

        @url http://scrapybook.s3.amazonaws.com:9312/properties/property_000000.html
        @returns items 1
        @scrapes title price description address image_urls
        @scrapes url project spider server date
        """

        # Create the loader using the response
        l = ItemLoader(item=PropertiesItem(), response=response)

        # Load fields using XPath expressions
        l.add_xpath('title', '//*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('address',
                    '//*[@itemtype="http://schema.org/Place"][1]/text()',
                    MapCompose(str.strip))
        l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                    MapCompose(lambda i: response.urljoin(i)))

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
github scalingexcellence / scrapybook-2nd-edition / ch07 / properties / properties / spiders / fast.py View on Github external
# Load fields using XPath expressions
        l.add_xpath('title', './/*[@itemprop="name"][1]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                    MapCompose(lambda i: i.replace(',', ''), float),
                    re='[,.0-9]+')
        l.add_xpath('description',
                    './/*[@itemprop="description"][1]/text()',
                    MapCompose(str.strip), Join())
        l.add_xpath('address',
                    './/*[@itemtype="http://schema.org/Place"]'
                    '[1]/*/text()',
                    MapCompose(str.strip))
        make_url = lambda i: response.urljoin(i)
        l.add_xpath('image_urls', './/*[@itemprop="image"][1]/@src',
                    MapCompose(make_url))

        # Housekeeping fields
        l.add_xpath('url', './/*[@itemprop="url"][1]/@href',
                    MapCompose(make_url))
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()