How to use ruia - 10 common examples

To help you get started, we’ve selected a few ruia examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github howie6879 / ruia / tests / test_field.py View on Github external
def test_css_select():
    """A TextField built with ``css_select="head title"`` extracts "ruia"."""
    title_field = TextField(css_select="head title")
    assert title_field.extract(html_etree=html_etree) == "ruia"
github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_attr_field():
    """Check AttrField extraction of ``href`` for a single match and for many."""
    # Single-value extraction: the ``.title`` element's href.
    title = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    title_href = title.extract(html_etree=html)
    assert title_href == "/"

    # many=True extraction: only the first returned href is checked.
    tags = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    tag_hrefs = tags.extract(html_etree=html)
    assert tag_hrefs[0] == "./easy.html"
github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_attr_field():
    """Extract ``href`` attributes with AttrField, singly and with ``many=True``."""
    title = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    first_title_href = title.extract(html_etree=html)
    assert first_title_href == "/"

    tags = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    first_tag_href = tags.extract(html_etree=html)[0]
    assert first_tag_href == "./easy.html"
github howie6879 / ruia / tests / test_item.py View on Github external
def clean_title(self, title):
    """Return ``title`` with the literal prefix "Title: " prepended."""
    prefix = "Title: "
    return prefix + title


class DoubanIgnoreItem(Item):
    """Item whose ``clean_title`` always raises ``IgnoreThisItem``."""

    # Title text selected from the document's <head><title> element.
    title = TextField(css_select="head title")

    async def clean_title(self, title):
        # Unconditionally raise; presumably this tells ruia to drop the
        # item -- confirm against the framework's Item implementation.
        raise IgnoreThisItem


class HackerNewsItem(Item):
    """Item declaring ``target_item``, ``title`` and ``url`` fields via CSS selectors."""

    # NOTE(review): target_item presumably marks the repeating container
    # that get_items() iterates over -- confirm against ruia's Item docs.
    target_item = TextField(css_select="tr.athing")
    # Text of the "a.storylink" anchor.
    title = TextField(css_select="a.storylink")
    # href attribute of the same "a.storylink" anchor.
    url = AttrField(css_select="a.storylink", attr="href")


async def parse_item(html):
    """Collect every item yielded by ``DoubanItems.get_items(html=html)`` into a list."""
    return [item async for item in DoubanItems.get_items(html=html)]


async def error_parse_item(html):
    """Collect every item yielded by ``DoubanItem.get_items(html=html)`` into a list.

    NOTE(review): the name suggests this call is expected to fail for
    ``DoubanItem`` -- confirm against the test that uses it.
    """
    return [item async for item in DoubanItem.get_items(html=html)]
github howie6879 / ruia / tests / test_spider.py View on Github external
def test_no_start_url_spider():
    """Start a spider that declares no ``start_urls`` and check the error type.

    Any exception raised while defining or starting the spider must be a
    ``ValueError``.

    NOTE(review): if nothing is raised at all, this test still passes.
    """
    try:

        class NoStartUrlSpider(Spider):
            pass

        NoStartUrlSpider.start()
    except Exception as exc:
        # Only the exception's type is verified, not its message.
        assert isinstance(exc, ValueError)
github howie6879 / ruia / tests / test_spider.py View on Github external
def test_callback_error():
    """Start two spiders: one without ``parse`` and one whose ``parse`` raises.

    The test makes no assertions; it passes as long as neither ``start()``
    call lets an exception escape.
    """

    # Spider that defines start_urls but no parse() callback of its own.
    class NoParseSpider(Spider):
        start_urls = ["https://httpbin.org/get"]

    NoParseSpider.start()

    # Spider whose parse() callback always raises ValueError.
    class CallbackError(Spider):
        start_urls = ["https://httpbin.org/get"]

        async def parse(self, response):
            raise ValueError("error")

    # NOTE(review): presumably ruia handles the callback error internally
    # rather than propagating it -- confirm against the Spider implementation.
    CallbackError.start()
github howie6879 / ruia / tests / test_spider.py View on Github external
def test_spider_hook_error():
    """Run a spider with a ``before_stop`` hook that raises ``TypeError``.

    The test makes no assertions; it passes as long as ``start()`` does not
    let an exception escape.
    """

    class SpiderDemo(Spider):
        start_urls = ["https://httpbin.org/get?p=0"]

        async def parse(self, response):
            # Deliberately does nothing with the response.
            pass

    async def before_stop_func(spider_ins):
        # Hook that always fails.
        raise TypeError("error")

    # A fresh event loop is created and handed to start() explicitly.
    loop = asyncio.new_event_loop()
    SpiderDemo.start(loop=loop, before_stop=before_stop_func)
github howie6879 / ruia / tests / test_spider.py View on Github external
def test_spider_hook():
    async def after_start_func(spider_ins):
        print("after_start_func")
        spider_ins.result["after_start"] = True
        assert isinstance(spider_ins.result, dict)

    async def before_stop_func(spider_ins):
        print("before_stop_func")
        spider_ins.result["before_stop"] = True

    class SpiderHook(Spider):
        start_urls = ["https://httpbin.org/get?p=0", "https://httpbin.org/404"]
        request_config = {"RETRIES": 1, "DELAY": 0, "TIMEOUT": 10}

        result = {
            "after_start": False,
            "before_stop": False,
            "process_succeed_response": False,
            "process_failed_response": False,
            "process_item": False,
        }

        async def parse(self, response):
            item = await ItemDemo.get_item(html=HTML)
            yield item

        async def process_item(self, item):
github howie6879 / ruia / tests / test_item.py View on Github external
class DoubanItem(Item):
    """Item with a ``title`` field, a constant attribute, and an async title cleaner."""

    # Title text selected from the document's <head><title> element.
    title = TextField(css_select="head title")
    # Plain string class attribute (not a Field instance).
    constant_attr = "hello ruia"

    async def clean_title(self, title):
        # Prepend the literal "Title: " to the given title.
        return "Title: " + title


class DoubanCleanMethodErrorItem(Item):
    """Item whose ``clean_title`` is a plain (non-async) method."""

    # Title text selected from the document's <head><title> element.
    title = TextField(css_select="head title")

    # NOTE(review): defined with ``def`` rather than ``async def``; given the
    # class name this is presumably intentional, to provoke an error from
    # ruia (possibly InvalidFuncType) -- confirm against the test using it.
    def clean_title(self, title):
        return "Title: " + title


class DoubanIgnoreItem(Item):
    """Item whose ``clean_title`` always raises ``IgnoreThisItem``."""

    # Title text selected from the document's <head><title> element.
    title = TextField(css_select="head title")

    async def clean_title(self, title):
        # Unconditionally raise; presumably this tells ruia to drop the
        # item -- confirm against the framework's Item implementation.
        raise IgnoreThisItem


class HackerNewsItem(Item):
    """Item declaring ``target_item``, ``title`` and ``url`` fields via CSS selectors."""

    # NOTE(review): target_item presumably marks the repeating container
    # that get_items() iterates over -- confirm against ruia's Item docs.
    target_item = TextField(css_select="tr.athing")
    # Text of the "a.storylink" anchor.
    title = TextField(css_select="a.storylink")
    # href attribute of the same "a.storylink" anchor.
    url = AttrField(css_select="a.storylink", attr="href")


async def parse_item(html):
    items = []
    async for item in DoubanItems.get_items(html=html):
        items.append(item)
github howie6879 / ruia / tests / test_item.py View on Github external
#!/usr/bin/env python

import asyncio
import os

from ruia import AttrField, Item, TextField
from ruia.exceptions import IgnoreThisItem, InvalidFuncType

# Absolute path to the HTML fixture, resolved relative to this file's real
# location (symlinks resolved by realpath) under the "data" subdirectory.
html_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "data", "for_item_testing.html"
)
# Read the whole fixture once at import time; HTML holds the decoded text.
with open(html_path, mode="r", encoding="utf-8") as file:
    HTML = file.read()


class DoubanItems(Item):
    """Item with ``target_item``, ``title``, ``cover`` and ``abstract`` fields.

    ``clean_title`` returns string titles unchanged and flattens any other
    iterable of elements into one string.
    """

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return ``title`` as a single string.

        A ``str`` is returned as-is. Otherwise ``title`` is iterated, and each
        element's ``.text`` is stripped, has "\\xa0" removed, and the pieces
        are concatenated.
        """
        if isinstance(title, str):
            return title
        pieces = (node.text.strip().replace("\xa0", "") for node in title)
        return "".join(pieces)


class DoubanItem(Item):
    title = TextField(css_select="head title")
    constant_attr = "hello ruia"