How to use the ruia.Spider class in ruia

To help you get started, we’ve selected a few ruia examples based on popular ways the library is used in public projects.

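Before diving into the excerpts: a ruia spider is a Spider subclass with some start_urls and an async parse callback, launched with the classmethod start(). A minimal sketch (class name and URL are just placeholders):

from ruia import Spider


class MyFirstSpider(Spider):
    # ruia schedules one request per seed URL.
    start_urls = ["https://httpbin.org/get"]

    async def parse(self, response):
        # Called once per fetched response.
        print(response.url)


if __name__ == "__main__":
    MyFirstSpider.start()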

From howie6879/ruia: tests/test_spider.py
from ruia import Spider  # needed by every excerpt below; shown once here


def test_no_start_url_spider():
    try:

        class NoStartUrlSpider(Spider):
            pass

        NoStartUrlSpider.start()
    except Exception as e:
        # Spider.start() raises ValueError when no start_urls are defined.
        assert isinstance(e, ValueError)
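The try/except-plus-assert pattern above passes even if no exception is raised at all; if pytest is available (as it normally is for a tests/ module), pytest.raises expresses the intent more strictly:

import pytest

from ruia import Spider


def test_no_start_url_spider_raises():
    class NoStartUrlSpider(Spider):
        pass

    # Spider.start() raises ValueError when start_urls is missing or empty.
    with pytest.raises(ValueError):
        NoStartUrlSpider.start()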
From howie6879/ruia: tests/test_spider.py
def test_callback_error():
    # A spider with no parse() override: ruia reports the missing callback.
    class NoParseSpider(Spider):
        start_urls = ["https://httpbin.org/get"]

    NoParseSpider.start()

    # A spider whose callback itself raises: ruia logs the error and finishes.
    class CallbackError(Spider):
        start_urls = ["https://httpbin.org/get"]

        async def parse(self, response):
            raise ValueError("error")

    CallbackError.start()
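Both start() calls above return normally even though one spider defines no parse() and the other’s parse() raises, which is the point of the test: ruia traps callback exceptions and logs them rather than letting them abort the crawl. A quick way to observe this (a sketch relying only on the behavior the test demonstrates):

from ruia import Spider


class CrashyParse(Spider):
    start_urls = ["https://httpbin.org/get"]

    async def parse(self, response):
        raise ValueError("boom")  # logged by ruia, not raised to the caller


CrashyParse.start()
print("still alive")  # reached, because the callback error was swallowed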
From howie6879/ruia: tests/test_spider.py
import asyncio  # for the explicit event loop below


def test_spider_hook_error():
    class SpiderDemo(Spider):
        start_urls = ["https://httpbin.org/get?p=0"]

        async def parse(self, response):
            pass

    # A before_stop hook that raises; the spider still shuts down cleanly.
    async def before_stop_func(spider_ins):
        raise TypeError("error")

    loop = asyncio.new_event_loop()
    SpiderDemo.start(loop=loop, before_stop=before_stop_func)
From howie6879/ruia: tests/test_spider.py
def test_spider_hook():
    async def after_start_func(spider_ins):
        print("after_start_func")
        spider_ins.result["after_start"] = True
        assert isinstance(spider_ins.result, dict)

    async def before_stop_func(spider_ins):
        print("before_stop_func")
        spider_ins.result["before_stop"] = True

    class SpiderHook(Spider):
        start_urls = ["https://httpbin.org/get?p=0", "https://httpbin.org/404"]
        request_config = {"RETRIES": 1, "DELAY": 0, "TIMEOUT": 10}

        result = {
            "after_start": False,
            "before_stop": False,
            "process_succeed_response": False,
            "process_failed_response": False,
            "process_item": False,
        }

        async def parse(self, response):
            # ItemDemo and HTML are fixtures defined elsewhere in the test module.
            item = await ItemDemo.get_item(html=HTML)
            yield item

        async def process_item(self, item):
            # Body inferred from the result dict above: mark the hook as having run.
            self.result["process_item"] = True
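The excerpt stops before the hooks are attached; presumably the test finishes by handing them to start() along these lines (a sketch, not the repo’s exact code):

import asyncio

loop = asyncio.new_event_loop()
SpiderHook.start(
    loop=loop,
    after_start=after_start_func,
    before_stop=before_stop_func,
)
# The test can then assert on SpiderHook.result to see which hooks ran.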
From howie6879/owllook: owllook/spiders/qidian_ranking.py
    async def clean_ranking_title(self, ranking_title):
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        else:
            # Keep everything up to and including '榜' ("ranking list").
            return str(ranking_title).split('榜')[0] + '榜'

    async def clean_more(self, more):
        return "https:" + more


class NameItem(Item):
    top_name = TextField(css_select='h4', default='')
    other_name = TextField(css_select='a.name', default='')


class QidianRankingSpider(Spider):
    start_urls = [f"https://www.qidian.com/rank/?chn={key}" for key in
                  [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]]

    concurrency = 3
    qidian_type = {
        '-1': '全部类别',  # all categories
        '21': '玄幻',  # xuanhuan fantasy
        '1': '奇幻',  # fantasy
        '2': '武侠',  # wuxia
        '22': '仙侠',  # xianxia
        '4': '都市',  # urban
        '15': '职场',  # workplace
        '6': '军事',  # military
        '5': '历史',  # historical
        '7': '游戏',  # gaming
        '8': '体育',  # sports
From howie6879/owllook: owllook/spiders/heiyan_novel_info.py
        else:
            return cover.replace('http', 'https')

    async def clean_novels_type(self, novels_type):
        # Normalize category names, e.g. map '社会' ("society") to '都市' ("urban").
        types_dict = {
            '社会': '都市'
        }
        return types_dict.get(str(novels_type).strip(), novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        # Turn the relative dates '今天' ("today") and '昨日' ("yesterday")
        # into absolute YYYY-MM-DD dates.
        today = time.strftime("%Y-%m-%d ", time.localtime())
        yesterday = time.strftime("%Y-%m-%d ", time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace(u'今天', today).replace(u'昨日', yesterday)


class HYNovelInfoSpider(Spider):
    start_urls = []
    request_config = {
        'RETRIES': 3,
        'TIMEOUT': 10
    }

    async def parse(self, res):
        # MotorBase is owllook's helper around the motor (async MongoDB) driver.
        self.motor_db = MotorBase(loop=self.loop).get_db()
        item = await HYNovelInfoItem.get_item(html=res.html)

        item_data = {
            'novel_name': item.novel_name,
            'author': item.author,
            'cover': item.cover,
            'abstract': item.abstract,
            'status': item.status,
From howie6879/ruia: examples/topics_examples/hacker_news_spider.py
"""
import aiofiles

from ruia import AttrField, TextField, Item, Spider


class HackerNewsItem(Item):
    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        return value.strip()


class HackerNewsSpider(Spider):
    start_urls = [
        "https://news.ycombinator.com/news?p=1",
        "https://news.ycombinator.com/news?p=2",
    ]
    concurrency = 10

    async def parse(self, response):
        async for item in HackerNewsItem.get_items(html=response.html):
            yield item

    async def process_item(self, item: HackerNewsItem):
        async with aiofiles.open("./hacker_news.txt", "a") as f:
            await f.write(str(item.title) + "\n")


if __name__ == "__main__":
    HackerNewsSpider.start()
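Two Item conventions carry this example: target_item makes each node matching tr.athing the root of one item, so get_items yields one HackerNewsItem per row, and any clean_<field> coroutine (like clean_title here) is applied automatically to that field’s raw value. The same machinery in miniature, with made-up HTML (using the async for style of this example; the owllook excerpts below await get_items instead, reflecting an older ruia API):

import asyncio

from ruia import Item, TextField


class RowItem(Item):
    target_item = TextField(css_select="li")  # one item per <li>
    name = TextField(css_select="span.name")

    async def clean_name(self, value):
        return value.upper()  # runs on every extracted name


async def main():
    html = "<ul><li><span class='name'>a</span></li><li><span class='name'>b</span></li></ul>"
    async for item in RowItem.get_items(html=html):
        print(item.name)  # -> 'A', then 'B'


asyncio.run(main())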
From howie6879/ruia: examples/topics_examples/retry_demo.py
#!/usr/bin/env python

from ruia import Spider


async def retry_func(request):
    request.request_config["TIMEOUT"] = 10


class RetryDemo(Spider):
    start_urls = ["http://httpbin.org/get"]

    request_config = {
        "RETRIES": 3,
        "DELAY": 0,
        "TIMEOUT": 0.1,  # deliberately tiny so the first attempt times out
        "RETRY_FUNC": retry_func,  # called before each retry; bumps TIMEOUT to 10
    }

    async def parse(self, response):
        pages = ["http://httpbin.org/get?p=1", "http://httpbin.org/get?p=2"]
        async for resp in self.multiple_request(pages):
            yield self.parse_item(response=resp)

    async def parse_item(self, response):
        json_data = await response.json()
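Note how the pieces fit: the deliberately tiny TIMEOUT makes first attempts fail, RETRY_FUNC then relaxes it before each retry, and multiple_request() fans out follow-up URLs from inside parse, each response being handed to parse_item via yield. The excerpt cuts off inside parse_item, which presumably just prints or stores the decoded JSON; all that is missing to run the demo is the usual entry point (a sketch, not the repo’s exact code):

if __name__ == "__main__":
    RetryDemo.start()  # first attempts time out; retries run with TIMEOUT=10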
From howie6879/owllook: owllook/spiders/zongheng_all_novels.py
    async def clean_novel_author(self, novel_author):
        if novel_author:
            if isinstance(novel_author, list):
                novel_author = novel_author[0].text
            return novel_author
        else:
            return ''

            # def tal_novel_author_home_url(self, novel_author_home_url):
            #     if isinstance(novel_author_home_url, list):
            #         novel_author_home_url = novel_author_home_url[0].get('href').strip()
            #     return 'http:' + novel_author_home_url


class ZHNovelsSpider(Spider):
    start_urls = ['http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html']

    request_config = {
        'RETRIES': 8,
        'DELAY': 0,
        'TIMEOUT': 3
    }
    concurrency = 60
    motor_db = MotorBase(loop=loop).get_db()  # `loop` is defined at module level in the source file

    async def parse(self, res):
        items_data = await ZHNovelsItem.get_items(html=res.html)
        tasks = []
        for item in items_data:
            if item.novel_url:
                res_dic = {
From howie6879/owllook: owllook/spiders/qidian_all_novels.py
    async def clean_novel_author(self, novel_author):
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    async def clean_novel_author_home_url(self, novel_author_home_url):
        if isinstance(novel_author_home_url, list):
            novel_author_home_url = novel_author_home_url[0].get('href').strip()
        return 'https:' + novel_author_home_url

    async def clean_novel_cover(self, novel_cover):
        return 'https:' + novel_cover


class QidianNovelsSpider(Spider):
    # start_urls = ['https://www.qidian.com/all?page=1']

    request_config = {
        'RETRIES': 15,
        'DELAY': 0,
        'TIMEOUT': 3
    }
    concurrency = 20
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        items_data = await QidianNovelsItem.get_items(html=res.html)
        tasks = []
        for item in items_data:
            res_dic = {
                'novel_url': item.novel_url,