How to use the ruia.Item class in ruia

To help you get started, we’ve selected a few ruia examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github howie6879 / ruia / tests / test_item.py View on Github external
class DoubanItem(Item):
    """Item fixture with one selector-backed field and one plain attribute."""

    # Declared with the CSS selector "head title".
    title = TextField(css_select="head title")
    # Ordinary string class attribute, not a ruia field.
    constant_attr = "hello ruia"

    async def clean_title(self, title):
        """Return *title* with the literal prefix ``"Title: "`` prepended."""
        prefix = "Title: "
        return prefix + title


class DoubanCleanMethodErrorItem(Item):
    """Item fixture whose ``clean_title`` is a plain ``def``, not ``async def``."""

    title = TextField(css_select="head title")

    # NOTE(review): this hook is synchronous while the other items in this
    # file use ``async def``; judging by the class name that is presumably
    # deliberate (error-path fixture) -- confirm before changing it.
    def clean_title(self, title):
        """Return *title* with the literal prefix ``"Title: "`` prepended."""
        prefix = "Title: "
        return prefix + title


class DoubanIgnoreItem(Item):
    """Item fixture whose title hook always raises ``IgnoreThisItem``."""

    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Raise ``IgnoreThisItem`` unconditionally; *title* is not inspected."""
        raise IgnoreThisItem()


class HackerNewsItem(Item):
    """Declarative item with a ``target_item`` selector plus title and link fields."""

    # Row selector; presumably the per-entry container used by get_items()
    # -- confirm against the ruia docs.
    target_item = TextField(css_select="tr.athing")
    # Declared with the CSS selector "a.storylink".
    title = TextField(css_select="a.storylink")
    # "href" attribute of the element matched by the same "a.storylink" selector.
    url = AttrField(css_select="a.storylink", attr="href")


async def parse_item(html):
    """Collect every item yielded by ``DoubanItems.get_items(html=html)`` into a list."""
    items = []
    async for item in DoubanItems.get_items(html=html):
        items.append(item)
    # NOTE(review): the excerpt stops here; as shown, ``items`` is never
    # returned -- confirm against the full file.
github howie6879 / ruia / tests / test_item.py View on Github external
#!/usr/bin/env python

import asyncio
import os

from ruia import AttrField, Item, TextField
from ruia.exceptions import IgnoreThisItem, InvalidFuncType

# Path of the HTML fixture: <directory of this file>/data/for_item_testing.html
# (realpath resolves symlinks before the directory is taken).
html_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "data", "for_item_testing.html"
)
# Read the fixture once at import time as UTF-8 text; the content is kept in
# the module-level HTML name.
with open(html_path, mode="r", encoding="utf-8") as file:
    HTML = file.read()


class DoubanItems(Item):
    """Item declaring a ``target_item`` selector and three per-entry fields."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Normalise *title* to a single string.

        A ``str`` is returned untouched.  Any other value is iterated: each
        element's ``.text`` is stripped, non-breaking spaces (U+00A0) are
        removed, and the pieces are concatenated.
        """
        if not isinstance(title, str):
            pieces = [node.text.strip().replace("\xa0", "") for node in title]
            return "".join(pieces)
        return title


class DoubanItem(Item):
    """Item with one selector-backed field and one plain attribute (excerpt ends after these lines)."""

    # Declared with the CSS selector "head title".
    title = TextField(css_select="head title")
    # Ordinary string class attribute, not a ruia field.
    constant_attr = "hello ruia"
github howie6879 / ruia / tests / test_spider.py View on Github external
async def retry_func(request):
    """Set ``request.request_config["TIMEOUT"]`` to 10, in place.

    Args:
        request: Object exposing a mutable ``request_config`` mapping.

    Returns:
        None.
    """
    config = request.request_config
    config["TIMEOUT"] = 10


@middleware.request
async def print_on_request(spider_ins, request):
    """Request middleware: replace ``request.headers`` with a fixed User-Agent.

    Despite its name, nothing is printed; ``spider_ins`` is unused.
    """
    fixed_headers = {"User-Agent": "ruia ua"}
    request.headers = fixed_headers


@middleware.response
async def print_on_response(spider_ins, request, response):
    """Response middleware: check the response body type and the request headers.

    Asserts that ``response.html`` is a ``str`` and that ``request.headers``
    equals ``{"User-Agent": "ruia ua"}``.  Nothing is printed and
    ``spider_ins`` is unused.

    Raises:
        AssertionError: If either check fails.
    """
    assert isinstance(response.html, str)
    expected_headers = {"User-Agent": "ruia ua"}
    assert request.headers == expected_headers


class ItemDemo(Item):
    """Minimal item with a single XPath-selected ``title`` field."""

    # Declared with the XPath expression /html/head/title.
    title = TextField(xpath_select="/html/head/title")


class SpiderDemo(Spider):
    start_urls = ["https://httpbin.org/get?p=0"]
    request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
    headers = {"User-Agent": "Ruia Spider"}

    call_nums = 0

    async def parse(self, response):
        yield Request(
            url=response.url,
            callback=self.parse_item,
            headers=self.headers,
            request_config=self.request_config,
github howie6879 / owllook / owllook / spiders / qidian_ranking.py View on Github external
#!/usr/bin/env python
import asyncio
import time

from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Item declaring selectors for one ranking block: title, "more" link and book entries."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = HtmlField(css_select='div.book-list>ul>li', many=True)

    async def clean_ranking_title(self, ranking_title):
        """Reduce *ranking_title* to its leading part.

        A list yields the ``.text`` of its first element.  Any other value is
        converted with ``str()`` and cut at the first '榜'; the '榜' suffix is
        then re-appended (also when the character was absent).
        """
        if not isinstance(ranking_title, list):
            head, _, _ = str(ranking_title).partition('榜')
            return head + '榜'
        return ranking_title[0].text

    async def clean_more(self, more):
        """Return *more* prefixed with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + more


class NameItem(Item):
github howie6879 / owllook / owllook / spiders / qidian_all_novels.py View on Github external
from owllook.database.mongodb import MotorBase
from owllook.spiders.middlewares import owl_middleware

# uvloop is an optional dependency: when it can be imported, install its
# event-loop policy; otherwise keep asyncio's default policy.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Obtain an event loop under whichever policy is active and register it as
# the current loop; it stays available as the module-level ``loop``.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class QidianNovelsItem(Item):
    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')

    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        return 'https:' + novel_url

    async def clean_novel_author(self, novel_author):
        if isinstance(novel_author, list):
github python-ruia / ruia-pyppeteer / example / jianshu_js_example.py View on Github external
#!/usr/bin/env python
"""
 Created by howie.hu at 2018/9/8.
"""

from ruia import AttrField, TextField, Item

from ruia_pyppeteer import PyppeteerSpider as Spider


class JianshuItem(Item):
    """Item declaring a ``target_item`` selector plus author name and URL fields."""

    # Entry selector; presumably one item is produced per match -- see ruia docs.
    target_item = TextField(css_select="ul.list>li")
    # Declared with the CSS selector "a.name".
    author_name = TextField(css_select="a.name")
    # "href" attribute of the element matched by the same "a.name" selector.
    author_url = AttrField(attr="href", css_select="a.name")

    async def clean_author_name(self, author_name):
        """Return *author_name* with surrounding whitespace stripped."""
        # The first parameter was misspelled ``selfself``; it is the instance
        # and is now named ``self`` like the sibling hook below.
        return author_name.strip()

    async def clean_author_url(self, author_url):
        """Return *author_url* appended to the ``https://www.jianshu.com`` origin."""
        return f"https://www.jianshu.com{author_url}"


class JianshuSpider(Spider):
    start_urls = ["https://www.jianshu.com/"]
    concurrency = 10

    async def parse(self, response):
github howie6879 / ruia / examples / topics_examples / hacker_news_spider.py View on Github external
#!/usr/bin/env python
"""
 Target: https://news.ycombinator.com/
 pip install aiofiles
"""
import aiofiles

from ruia import AttrField, TextField, Item, Spider


class HackerNewsItem(Item):
    """Item declaring a ``target_item`` selector plus title and link fields."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        """Return *value* with leading and trailing whitespace removed."""
        trimmed = value.strip()
        return trimmed


class HackerNewsSpider(Spider):
    start_urls = [
        "https://news.ycombinator.com/news?p=1",
        "https://news.ycombinator.com/news?p=2",
    ]
    concurrency = 10

    async def parse(self, response):
github howie6879 / ruia / examples / simple_spider / python_documentation_spider.py View on Github external
import asyncio
import sys

from ruia import Item, TextField, AttrField


class PythonDocumentationItem(Item):
    """Item with a page-title field and a link field for the "Tutorial" anchor."""

    # Declared with the CSS selector "title".
    title = TextField(css_select="title")
    # "href" attribute of the <a> whose text is exactly "Tutorial" (XPath match).
    tutorial_link = AttrField(xpath_select="//a[text()='Tutorial']", attr="href")


async def field_extraction():
    """Build a ``PythonDocumentationItem`` for the Python 3 docs URL and print it.

    Awaits ``PythonDocumentationItem.get_item`` with
    ``url="https://docs.python.org/3/"``, then prints the item's ``title``
    followed by its ``tutorial_link``.
    """
    doc_item = await PythonDocumentationItem.get_item(url="https://docs.python.org/3/")
    print(doc_item.title)
    print(doc_item.tutorial_link)


if __name__ == "__main__":
    if sys.version_info[:2] == (3, 7):
        # Recommended for Python 3.7
        asyncio.run(field_extraction())
    else:
github howie6879 / owllook / owllook / spiders / zh_ranking.py View on Github external
import time

from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Item declaring selectors for one ranking block: title, "more" link and book entries."""

    # Container selector; presumably one item is produced per match via
    # get_items() -- confirm against the ruia docs.
    target_item = TextField(css_select='div.rank_i_p_list')
    # Declared with the CSS selector "div.rank_i_p_tit".
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # "href" attribute of the anchor inside "div.rank_i_more".
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # many=True: presumably every matching element is kept rather than only
    # the first -- confirm against the ruia docs.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)


class NameItem(Item):
    """Item with two book-name fields, each declared with ``default=''``."""

    # default='' is passed explicitly; presumably the value used when the
    # selector matches nothing -- confirm against the ruia docs.
    top_name = TextField(css_select='div.rank_i_bname a.rank_i_l_a_book', default='')
    # Broader selector: any anchor inside "div.rank_i_bname".
    other_name = TextField(css_select='div.rank_i_bname a', default='')


class ZHRankingSpider(Spider):
    start_urls = ['http://book.zongheng.com/rank.html']

    concurrency = 3

    async def parse(self, res):
        result = []
        res_dic = {}

        async for item in RankingItem.get_items(html=res.html):
            each_book_list = []
            # 只取排名前十的书籍数据
github howie6879 / ruia / examples / simple_spider / douban_spider.py View on Github external
#!/usr/bin/env python

from ruia import AttrField, Item, Spider, TextField


class DoubanItem(Item):
    """Item declaring a ``target_item`` selector and three per-entry fields."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    abstract = TextField(css_select="span.inq", default="")

    async def clean_title(self, title):
        """Normalise *title* to a single string.

        A ``str`` is returned untouched.  Any other value is iterated: each
        element's ``.text`` is stripped, non-breaking spaces (U+00A0) are
        removed, and the pieces are concatenated.
        """
        if not isinstance(title, str):
            pieces = [node.text.strip().replace("\xa0", "") for node in title]
            return "".join(pieces)
        return title


class DoubanSpider(Spider):
    start_urls = ["https://movie.douban.com/top250"]
    request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
    concurrency = 10