How to use the ruia.AttrField function in ruia

To help you get started, we’ve selected a few ruia examples that show popular ways ruia.AttrField is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_attr_field():
    """Check ``AttrField`` extraction of ``href`` for a single match and with ``many=True``."""
    # Single-value field: the extracted value for ".title" is compared to "/".
    title_field = ruia.AttrField(
        css_select=".title",
        attr="href",
        default="Untitled",
    )
    title_href = title_field.extract(html_etree=html)
    assert title_href == "/"

    # many=True field: only the first extracted element is checked.
    tag_field = ruia.AttrField(
        css_select=".tag",
        attr="href",
        default="No tag",
        many=True,
    )
    tag_hrefs = tag_field.extract(html_etree=html)
    assert tag_hrefs[0] == "./easy.html"
github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_attr_field():
    """Check ``AttrField`` extraction of ``href`` for a single match and with ``many=True``."""
    # Single-value field: the extracted value for ".title" is compared to "/".
    title_field = ruia.AttrField(
        css_select=".title",
        attr="href",
        default="Untitled",
    )
    title_href = title_field.extract(html_etree=html)
    assert title_href == "/"

    # many=True field: only the first extracted element is checked.
    tag_field = ruia.AttrField(
        css_select=".tag",
        attr="href",
        default="No tag",
        many=True,
    )
    tag_hrefs = tag_field.extract(html_etree=html)
    assert tag_hrefs[0] == "./easy.html"
github howie6879 / ruia / tests / test_item.py View on Github external
def clean_title(self, title):
        """Return ``title`` with the literal prefix ``"Title: "`` prepended."""
        prefix = "Title: "
        return prefix + title


class DoubanIgnoreItem(Item):
    """Item with a single ``title`` field whose cleaner always raises ``IgnoreThisItem``."""

    # Field located with the CSS selector "head title".
    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Unconditionally raise ``IgnoreThisItem``; the ``title`` argument is not used."""
        raise IgnoreThisItem


class HackerNewsItem(Item):
    """Item declaring ``target_item``, ``title`` and ``url`` fields via CSS selectors."""

    # NOTE(review): presumably ``target_item`` marks the repeating container
    # that each item is extracted from — confirm against ruia's Item docs.
    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    # Same selector as ``title``, but configured to read the ``href`` attribute.
    url = AttrField(css_select="a.storylink", attr="href")


async def parse_item(html):
    """Collect every item yielded by ``DoubanItems.get_items(html=html)`` into a list."""
    return [entry async for entry in DoubanItems.get_items(html=html)]


async def error_parse_item(html):
    """Collect every item yielded by ``DoubanItem.get_items(html=html)`` into a list.

    NOTE(review): the name suggests this call is expected to fail in the
    surrounding tests — confirm against the ``DoubanItem`` definition.
    """
    return [entry async for entry in DoubanItem.get_items(html=html)]
github howie6879 / owllook / owllook / spiders / heiyan_novel_info.py View on Github external
from pprint import pprint

from ruia import Spider, Item, TextField, AttrField
from ruia_ua import middleware as ua_middleware

from owllook.database.mongodb import MotorBase


class HYNovelInfoItem(Item):
    """
    Item subclass declaring the novel-info fields.

    Every active field is an ``AttrField``. All but ``novel_chapter_url``
    select a ``<meta property='og:...'>`` tag and read its ``content``
    attribute.
    """
    # Fields taken from <meta property='og:...'> tags (attr='content').
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    # The one field not read from a meta tag: href of 'div#voteList a.index'.
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # NOTE(review): commented-out alternative definitions that select page
    # elements instead of meta tags; presumably superseded by the fields
    # above — confirm before removing.
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
github howie6879 / owllook / owllook / spiders / zongheng_all_novels.py View on Github external
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Fetch the current event loop and register it as the current loop; the
# module-level name ``loop`` stays bound to it.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class ZHNovelsItem(Item):
    """Item declaring the per-book fields, each located by a CSS selector."""

    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # def tal_novel_url(self, novel_url):
    # return 'http:' + novel_url

    async def clean_novel_author(self, novel_author):
        """
        Normalize the extracted author value.

        Returns ``''`` for a falsy value, the ``.text`` of the first element
        when given a list, and the value unchanged otherwise.
        """
        # Guard clause: nothing extracted (None, '', empty list, ...).
        if not novel_author:
            return ''
        # A non-empty list is reduced to the text of its first element.
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author

            # def tal_novel_author_home_url(self, novel_author_home_url):
            #     if isinstance(novel_author_home_url, list):
github howie6879 / owllook / owllook / spiders / heiyan_novel_info.py View on Github external
from owllook.database.mongodb import MotorBase


class HYNovelInfoItem(Item):
    """
    定义继承自item的Item类
    """
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')

    async def clean_cover(self, cover):
        if 'https' in cover:
            return cover
        else:
github howie6879 / owllook / owllook / spiders / heiyan_novel_info.py View on Github external
class HYNovelInfoItem(Item):
    """
    Item subclass declaring the novel-info fields.

    Every active field is an ``AttrField``. All but ``novel_chapter_url``
    select a ``<meta property='og:...'>`` tag and read its ``content``
    attribute.
    """
    # Fields taken from <meta property='og:...'> tags (attr='content').
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    # The one field not read from a meta tag: href of 'div#voteList a.index'.
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # NOTE(review): commented-out alternative definitions that select page
    # elements instead of meta tags; presumably superseded by the fields
    # above — confirm before removing.
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')

    async def clean_cover(self, cover):
        """
        Upgrade an ``http://`` cover URL to ``https://``.

        Only a leading ``http://`` scheme is rewritten. Any other value
        (already ``https://``, protocol-relative, relative path, ...) is
        returned unchanged.

        The previous implementation tested ``'https' in cover`` and used
        ``cover.replace('http', 'https')``. That skipped the upgrade whenever
        "https" appeared anywhere in the URL and rewrote every "http"
        substring, including ones inside the path or query string.

        :param cover: the extracted cover URL string.
        :return: the cover URL, with a leading ``http://`` replaced by
            ``https://``.
        """
        insecure_scheme = 'http://'
        if cover.startswith(insecure_scheme):
            # Swap the scheme only; the rest of the URL is kept verbatim.
            return 'https://' + cover[len(insecure_scheme):]
        return cover
github howie6879 / owllook / owllook / spiders / zh_ranking.py View on Github external
#!/usr/bin/env python
"""
 Created by howie.hu at 29/11/2017.
"""
import time

from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Item declaring the ranking fields, each located by a CSS selector."""
    target_item = TextField(css_select='div.rank_i_p_list')
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # href of the link matched by 'div.rank_i_more a'.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # NOTE(review): with many=True this presumably yields one entry per
    # matched 'div.rank_i_li' element — confirm against ruia's field docs.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li', many=True)


class NameItem(Item):
    """Item with two text fields, both declared with ``default=''``."""
    # Narrower selector: only anchors carrying the 'rank_i_l_a_book' class.
    top_name = TextField(css_select='div.rank_i_bname a.rank_i_l_a_book', default='')
    # Broader selector: any anchor under 'div.rank_i_bname'.
    other_name = TextField(css_select='div.rank_i_bname a', default='')


class ZHRankingSpider(Spider):
    start_urls = ['http://book.zongheng.com/rank.html']

    concurrency = 3

    async def parse(self, res):
        result = []
        res_dic = {}
github howie6879 / owllook / owllook / spiders / qidian_all_novels.py View on Github external
from owllook.spiders.middlewares import owl_middleware

try:
    # uvloop is optional: when it is importable, install its event loop policy.
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    # uvloop is not installed; leave asyncio's existing policy in place.
    pass

# Fetch the current event loop and register it as the current loop; the
# module-level name ``loop`` stays bound to it.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class QidianNovelsItem(Item):
    """Item declaring the per-book fields, each located by a CSS selector."""

    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')

    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Return ``novel_url`` with the literal ``'https:'`` prepended."""
        scheme = 'https:'
        return scheme + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the first element's ``.text`` for a list, else the value unchanged."""
        return novel_author[0].text if isinstance(novel_author, list) else novel_author
github howie6879 / owllook / owllook / spiders / qidian_all_novels.py View on Github external
import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Fetch the current event loop and register it as the current loop; the
# module-level name ``loop`` stays bound to it.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)


class QidianNovelsItem(Item):
    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')

    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        return 'https:' + novel_url

    async def clean_novel_author(self, novel_author):
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    async def clean_novel_author_home_url(self, novel_author_home_url):
        if isinstance(novel_author_home_url, list):