How to use the ruia.TextField class in ruia

To help you get started, we’ve selected a few ruia examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github howie6879 / ruia / tests / test_field.py View on Github external
def test_css_select():
    """Check that a TextField built with ``css_select="head title"`` extracts "ruia" from ``html_etree``."""
    title_field = TextField(css_select="head title")
    extracted = title_field.extract(html_etree=html_etree)
    assert extracted == "ruia"
github howie6879 / owllook / owllook / spiders / zh_ranking.py View on Github external
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Selectors for one ranking section; iterated by ZHRankingSpider.parse via ``get_items``."""

    # NOTE(review): in ruia, ``target_item`` presumably selects the root element
    # of each repeated item that ``get_items`` yields — confirm against ruia docs.
    target_item = TextField(css_select='div.rank_i_p_list')
    # Text of the ranking's title element.
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # ``href`` attribute of the link inside the "more" container.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # HTML of every book row (``many=True``); the spider slices this sequence
    # and feeds each entry to ``NameItem.get_item(html=...)``.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)


class NameItem(Item):
    """Book-name selectors applied to a single book row's HTML."""

    # Both fields declare ``default=''``.
    # NOTE(review): presumably the default is used when the selector matches
    # nothing — confirm against ruia's Field documentation.
    top_name = TextField(css_select='div.rank_i_bname a.rank_i_l_a_book', default='')
    other_name = TextField(css_select='div.rank_i_bname a', default='')


class ZHRankingSpider(Spider):
    start_urls = ['http://book.zongheng.com/rank.html']

    concurrency = 3

    async def parse(self, res):
        result = []
        res_dic = {}

        async for item in RankingItem.get_items(html=res.html):
            each_book_list = []
            # 只取排名前十的书籍数据
            for index, value in enumerate(item.book_list[:10]):
                item_data = await NameItem.get_item(html=value)
github howie6879 / ruia / examples / hacker_news_spider / items.py View on Github external
#!/usr/bin/env python

from ruia import AttrField, TextField, Item


class HackerNewsItem(Item):
    """
    Extraction rules for the target fields.
    """

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        """
        Clean the extracted title data.
        :param value: the raw extracted value
        :return: ``value`` converted to ``str`` with surrounding whitespace stripped
        """
        title_text = str(value)
        return title_text.strip()
github howie6879 / ruia / examples / topics_examples / ruia_demo.py View on Github external
from ruia import TextField, Item, Spider

class HackerNewsItem(Item):
    """Selectors for one Hacker News story row."""

    # NOTE(review): ``target_item`` is presumably the per-item root selector
    # that ``get_items`` iterates over — confirm against ruia docs.
    target_item = TextField(css_select='tr.athing')
    # Text of the story link.
    title = TextField(css_select='a.storylink')


class HackerNewsSpider(Spider):
    """Spider seeded with the first Hacker News listing page."""

    start_urls = ['https://news.ycombinator.com/news?p=1']

    async def parse(self, response):
        """Yield every HackerNewsItem obtained from the response's HTML."""
        page_html = response.html
        async for news_item in HackerNewsItem.get_items(html=page_html):
            yield news_item

# Start the spider only when this file is executed as a script, not on import.
if __name__ == '__main__':
    HackerNewsSpider.start()
github howie6879 / ruia / examples / topics_examples / hacker_news_item.py View on Github external
#!/usr/bin/env python
"""
 Target: https://news.ycombinator.com/
"""
import asyncio

from ruia import AttrField, TextField, Item


class HackerNewsItem(Item):
    """Selectors for one Hacker News story row, plus a title clean-up step."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")

    async def clean_title(self, value):
        """Return ``value`` with leading and trailing whitespace removed."""
        stripped = value.strip()
        return stripped


async def single_page_demo(url="https://news.ycombinator.com/"):
    """Print every HackerNewsItem obtained for ``url``."""
    item_stream = HackerNewsItem.get_items(url=url)
    async for news_item in item_stream:
        print(news_item)


async def multiple_page_demo():
    pages = [
        single_page_demo(f"https://news.ycombinator.com/news?p={page}")
        for page in range(1, 3)
github howie6879 / owllook / owllook / spiders / qidian_honor_spider.py View on Github external
Created by howie.hu at 11/03/2018.
 获取起点荣誉数据,如:https://book.qidian.com/honor/1009704712
 荣誉类型:
  - 推荐票
  - 收藏
  - 点击
"""

from ruia import Spider, Item, TextField
from ruia_ua import middleware


class QidianHonorItem(Item):
    """Selectors for one honor entry (an ``li.cf`` element)."""

    # NOTE(review): ``target_item`` is presumably the per-item root selector
    # that ``get_items`` iterates over — confirm against ruia docs.
    target_item = TextField(css_select='li.cf')
    # Text of the entry's ``span.decs`` element.
    honor_text = TextField(css_select='span.decs')
    # Text of the entry's ``span.time`` element.
    honor_time = TextField(css_select='span.time')


class QidianHonorSpider(Spider):
    start_urls = ['https://book.qidian.com/honor/1009531496']

    request_config = {
        'RETRIES': 3,
        'DELAY': 0,
        'TIMEOUT': 10
    }

    async def parse(self, res):
        items_data = await QidianHonorItem.get_items(html=res.html)
        click_list, col_list, rec_list, other_list = [], [], [], []
        for item in items_data:
            data = {
github howie6879 / owllook / owllook / spiders / zongheng_all_novels.py View on Github external
from owllook.database.mongodb import MotorBase
from owllook.spiders.middlewares import owl_middleware

# Use uvloop's event loop policy when uvloop is installed; otherwise keep
# whatever policy asyncio already has.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    # uvloop is optional: a missing package is deliberately ignored.
    pass

# Obtain the event loop and set it as the current event loop.
# NOTE(review): asyncio.get_event_loop() is documented as deprecated on newer
# Python versions when no current loop exists — confirm the supported range.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class ZHNovelsItem(Item):
    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # def tal_novel_url(self, novel_url):
    # return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        if novel_author:
            if isinstance(novel_author, list):
                novel_author = novel_author[0].text
github howie6879 / ruia / examples / typical_spider / hacker_news_spider.py View on Github external
import asyncio
import aiofiles
from ruia import Item, TextField, AttrField, Spider


class HackerNewsItem(Item):
    """Selectors for one Hacker News story row."""

    # NOTE(review): ``target_item`` is presumably the per-item root selector
    # that ``get_items`` iterates over — confirm against ruia docs.
    target_item = TextField(css_select='tr.athing')
    # Text of the story link.
    title = TextField(css_select='a.storylink')
    # ``href`` attribute of the same story link.
    url = AttrField(css_select='a.storylink', attr='href')


class HackerNewsSpider(Spider):
    """Spider over ten Hacker News listing URLs that appends story titles to a text file."""

    concurrency = 2
    start_urls = [
        f'https://news.ycombinator.com/news?p={page}'
        for page in range(10)
    ]

    async def parse(self, res):
        """Append the title of every item found in ``res.html`` to ``./hacker_news.txt``."""
        news_items = await HackerNewsItem.get_items(html=res.html)
        for news_item in news_items:
            # The file is reopened in append mode for each item, one title per line.
            async with aiofiles.open('./hacker_news.txt', mode='a', encoding='utf-8') as out_file:
                await out_file.write(news_item.title + '\n')


async def test_item():
github howie6879 / owllook / owllook / spiders / zongheng_all_novels.py View on Github external
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Obtain the event loop and set it as the current event loop.
# NOTE(review): asyncio.get_event_loop() is documented as deprecated on newer
# Python versions when no current loop exists — confirm the supported range.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class ZHNovelsItem(Item):
    """Selectors for a single novel entry (one ``div.bookbox``) in the store list."""

    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # def tal_novel_url(self, novel_url):
    # return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        """Normalize the extracted author value.

        :param novel_author: the raw extracted value; may be falsy, a list of
            elements, or an already-usable value
        :return: ``''`` for a falsy value, the ``.text`` of the first element
            for a list, otherwise the value unchanged
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author

            # def tal_novel_author_home_url(self, novel_author_home_url):
github howie6879 / ruia / examples / simple_spider / douban_spider.py View on Github external
#!/usr/bin/env python

from ruia import AttrField, Item, Spider, TextField


class DoubanItem(Item):
    """Selectors for one ``div.item`` entry, plus a title clean-up step."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    abstract = TextField(css_select="span.inq", default="")

    async def clean_title(self, title):
        """Return the title as one string.

        :param title: either a ``str`` or an iterable of elements exposing ``.text``
        :return: the string unchanged, or the elements' stripped texts joined
            together with non-breaking spaces (``\\xa0``) removed
        """
        if not isinstance(title, str):
            return "".join(
                node.text.strip().replace("\xa0", "") for node in title
            )
        return title


class DoubanSpider(Spider):
    start_urls = ["https://movie.douban.com/top250"]
    request_config = {"RETRIES": 3, "DELAY": 0, "TIMEOUT": 20}
    concurrency = 10
    # proxy config
    # kwargs = {"proxy": "http://0.0.0.0:1087"}
    kwargs = {}