How to use the scrapy.selector.Selector class in Scrapy

To help you get started, we've selected a few examples based on popular ways Selector is used in public projects.
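
Selector wraps a response object or a raw text string and exposes .xpath() and .css() queries against it. Here is a minimal sketch of both construction styles; the HTML string and values are made up for illustration:

from scrapy.selector import Selector

html = '<html><body><p class="intro">Hello</p><p>World</p></body></html>'
sel = Selector(text=html)            # build from a raw string

sel.xpath('//p/text()').getall()     # ['Hello', 'World']
sel.css('p.intro::text').get()       # 'Hello'

# Inside a spider callback you can wrap the response directly:
# sel = Selector(response)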


github inspirehep/hepcrawl/tests/unit/test_t2k.py
def record():
    """Return results from the T2K spider."""
    spider = t2k_spider.T2kSpider()
    response = fake_response_from_file('t2k/test_1.html')
    selector = Selector(response, type='html')
    nodes = selector.xpath('//%s' % spider.itertag)
    spider.domain = "file:///tests/responses/t2k/"
    parsed_node = spider.parse_node(response, nodes[0])

    splash_response = fake_response_from_file('t2k/001.html')
    splash_response.meta["date"] = parsed_node.meta["date"]
    splash_response.meta["title"] = parsed_node.meta["title"]
    splash_response.meta["urls"] = parsed_node.meta["urls"]
    splash_response.meta["authors"] = parsed_node.meta["authors"]

    # take the first item yielded by the generator (Python 3's next(), not .next())
    parsed_item = next(spider.scrape_for_pdf(splash_response))
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record

github scrapy/scrapy/tests/test_selector.py
def test_badly_encoded_body(self):
    # \xe9 alone isn't a valid UTF-8 sequence
    r1 = TextResponse('http://www.example.com',
                      body=b'<p>an Jos\xe9 de</p>',
                      encoding='utf-8')
    Selector(r1).xpath('//text()').getall()
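
A side note on the API used above: .getall() and .get() are the newer names for .extract() and .extract_first(); the pairs are interchangeable aliases in recent Scrapy versions. For example (made-up markup):

sel = Selector(text='<p>a</p><p>b</p>')
sel.xpath('//p/text()').getall()   # ['a', 'b'], same as .extract()
sel.xpath('//p/text()').get()      # 'a', same as .extract_first()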

github inspirehep/hepcrawl/tests/unit/test_base.py
def urls():
    spider = base_spider.BaseSpider()
    response = fake_response_from_file('base/test_1.xml')
    selector = Selector(response, type='xml')
    spider._register_namespaces(selector)
    nodes = selector.xpath('.//%s' % spider.itertag)
    return spider.get_urls_in_record(nodes[0])

github heyrict/VPNGater/vpngater.py
def get_ovpn(url, save_to):
    # 'request' here is urllib.request; 'download' is a helper defined elsewhere in the module.
    page = Selector(text=request.urlopen(url).read())\
            .xpath('.//ul[@class="listBigArrow"]/li/a')
    cururl = url.rsplit('/', 2)[0]
    for link in page:
        label = link.xpath('./strong/text()').extract_first()
        href = link.xpath('./@href').extract_first()
        # 'in' instead of .find(...) > 0, which would miss a match at index 0
        if 'UDP' in label:
            download(cururl + href, save_to + 'UDP.ovpn')
        elif 'TCP' in label:
            download(cururl + href, save_to + 'TCP.ovpn')
        else:
            download(cururl + href, save_to + '.ovpn')

github stamhe/spider_scrapy_lianjia/spider_scrapy_lianjia/spiders/dianpingxmtgymspider.py
def parse_content(self, response):
    self.log("=================================================")
    self.log("cat_url = %s proxy = %s" % (response.meta['cat_url'], response.meta['proxy']))
    chenshi_name = response.meta['city_id']
    sel = Selector(response)

    item = SpiderDianpingXmtItem()

    shop_type   = response.meta['shop_type']
    shop_url    = response.url
    http_status = response.status

    self.log("chenshi_name = %s" % chenshi_name)
    self.log("shop_url = %s" % shop_url)
    self.log("shop_type = %s" % shop_type)
    self.log("http_status = %s" % http_status)

    x = sel.xpath('//div[@id="basic-info"]')
    if len(x) > 0:
        shop_name = x[0].xpath('h1[@class="shop-name"]/text()').extract()[0].strip()

github maxliaops/scrapy-itzhaopin/itzhaopin/itzhaopin/spiders/tencent_spider.py
def parse_item(self, response):
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css('table.tablelist tr.even')
    for site in sites_even:
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()[0]
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()[0]
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'

    sites_odd = sel.css('table.tablelist tr.odd')
    for site in sites_odd:

github scrapinghub/portia/slybot/slybot/linkextractor/xml.py
def _extract_links(self, response):
    body = response.body_as_unicode()
    _type = 'html'
    # assumed continuation of the truncated snippet: switch to the XML
    # parser when the body opens with an XML declaration
    if body.lstrip().startswith('<?xml'):
        _type = 'xml'

github alltheplaces/alltheplaces/locations/spiders/walgreens.py
def parse(self, response):
    xml = Selector(response)
    xml.remove_namespaces()

    urls = xml.xpath('//loc/text()').extract()

    for url in urls:
        yield scrapy.Request(
            response.urljoin(url),
            callback=self.parse_store
        )
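
remove_namespaces() matters here because sitemap XML declares the sitemaps.org namespace, so a bare //loc matches nothing until the namespaces are stripped. The alternative, assuming the standard sitemap namespace, is to register it explicitly:

urls = xml.xpath(
    '//s:loc/text()',
    namespaces={'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'},
).extract()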

github tapaswenipathak/STW-Collection/ScrapScrapy/ScrapScrapy/spiders/SSSpider.py
def parse(self, response):
    sel = Selector(response)
    item = ScrapscrapyItem()
    item['Heading'] = str(
        sel.xpath('/html/body/div[2]/div/div[1]/div/div[1]/h1').extract())
    item['Content'] = str(
        sel.xpath('/html/body/div[2]/div/div[1]/div/div[1]/p/text()').extract())
    item['Source_Website'] = "http://scrapy.org"
    return item

github tshua/ArticalRecommand/data_spider/data_spider/spiders/toutiao.py
def parseSourceUrl(self, response):
    '''Parse the article page and send items to the pipeline.'''
    article_content = response.xpath("//div[@id='article-main']").extract()
    if article_content:
        toutiaoItem = ToutiaoItem()
        toutiaoItem['title'] = \
            Selector(text=article_content[0]).xpath('//h1[@class="article-title"]/text()').extract()[0]
        toutiaoItem['source'] = '头条'
        toutiaoItem['tag'] = Selector(text=article_content[0]).xpath('//li[@class="label-item"]/text()').extract()
        toutiaoItem['title_hash'] = data_spider.common.get_md5_value(toutiaoItem['title'].encode("utf-8"))
        toutiaoItem['artical_url'] = str(toutiaoItem['title_hash']) + ".html"
        article_time = Selector(text=article_content[0]).xpath('//span[@class="time"]/text()').extract()
        toutiaoItem['artical_time'] = ''
        if article_time:
            toutiaoItem['artical_time'] = article_time[0]
        toutiaoItem['collect_time'] = int(time.time())
        # strip href attributes before saving the fragment to disk
        article_content, number = re.subn(r"href=\".*\"", '', article_content[0])
        article_content = bytes(article_content, "utf-8")
        with open("./html/" + toutiaoItem['artical_url'], 'wb') as f:
            f.write(article_content)
        yield toutiaoItem
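
One design note on the example above: every Selector(text=article_content[0]) call re-parses the same HTML fragment. Parsing it once and reusing the selector avoids the repeated work; a small sketch:

fragment = Selector(text=article_content[0])
toutiaoItem['title'] = fragment.xpath('//h1[@class="article-title"]/text()').extract()[0]
toutiaoItem['tag'] = fragment.xpath('//li[@class="label-item"]/text()').extract()
article_time = fragment.xpath('//span[@class="time"]/text()').extract()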