How to use the ruia.HtmlField class in ruia

To help you get started, we’ve selected a few ruia examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_html_field():
    """Check ``ruia.HtmlField.extract`` for one match and for ``many=True``."""
    # Single-match case: the extracted value is compared to one markup string.
    expected_title = '<div href="/" class="title">Ruia Documentation</div>\n'
    title = ruia.HtmlField(css_select=".title", default="Untitled")
    assert title.extract(html_etree=html) == expected_title

    # ``many=True`` case: the result is indexable; only the second entry is
    # checked, including the whitespace that trails the closing tag.
    expected_second_tag = '<li href="./fast.html" class="tag">fast</li>\n    '
    tags = ruia.HtmlField(css_select=".tag", default="No tag", many=True)
    extracted_tags = tags.extract(html_etree=html)
    assert extracted_tags[1] == expected_second_tag
github howie6879 / ruia / tests / test_field.py View on Github external
def test_html_field():
    """Check that ``HtmlField.extract`` yields the markup of the matched node."""
    # Build both fields up front, then extract and compare one at a time.
    field_en, field_zh = (
        HtmlField(css_select=selector)
        for selector in ("div.brand a", "div.brand p")
    )

    markup_en = field_en.extract(html_etree=html_etree)
    assert markup_en == '<a href="https://github.com">Github</a>'

    # The expected value for the <p> match carries a trailing newline.
    markup_zh = field_zh.extract(html_etree=html_etree)
    assert markup_zh == "<p>你好</p>\n"
github howie6879 / ruia / tests / test_field.py View on Github external
def test_html_field_with_many():
    field = HtmlField(css_select="a.test_link", many=True)
    values = field.extract(html_etree=html_etree)
    assert len(values) == 5
    assert (
        values[0]
        == '<a href="https://github.com/howie6879/" class="test_link">hello1 github.</a>\n'
    )
    assert (
        values[4]
        == '<a href="https://github.com/howie6879/" class="test_link">hello5 github.</a>\n'
        "    Some text outside.\n"
github howie6879 / ruia / tests / for_doc / apis / test_field.py View on Github external
def test_html_field():
    """Exercise ``ruia.HtmlField.extract`` with and without ``many=True``."""
    # Without ``many``: the extraction is compared against one markup string.
    title_field = ruia.HtmlField(css_select=".title", default="Untitled")
    title_markup = title_field.extract(html_etree=html)
    assert (
        title_markup
        == '<div href="/" class="title">Ruia Documentation</div>\n'
    )

    # With ``many=True``: the result is indexed and its second entry checked.
    tag_field = ruia.HtmlField(css_select=".tag", default="No tag", many=True)
    second_tag = tag_field.extract(html_etree=html)[1]
    assert second_tag == '<li href="./fast.html" class="tag">fast</li>\n    '
github howie6879 / ruia / tests / test_field.py View on Github external
def test_html_field():
    """Verify the markup ``HtmlField`` extracts for two ``div.brand`` children."""
    anchor_field = HtmlField(css_select="div.brand a")
    paragraph_field = HtmlField(css_select="div.brand p")

    # Anchor match: expected markup has no trailing whitespace.
    expected_anchor = '<a href="https://github.com">Github</a>'
    assert anchor_field.extract(html_etree=html_etree) == expected_anchor

    # Paragraph match: expected markup ends with a newline.
    expected_paragraph = "<p>你好</p>\n"
    assert paragraph_field.extract(html_etree=html_etree) == expected_paragraph
github howie6879 / owllook / owllook / spiders / qidian_ranking.py View on Github external
#!/usr/bin/env python
import asyncio
import time

from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Item describing one ranking block: its title, "more" link and book list."""

    # NOTE(review): presumably ruia treats ``target_item`` as the per-item
    # root selector — confirm against the ruia Item documentation.
    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(css_select="h3>a.more", attr="href")
    # One HTML fragment per <li> of the book list.
    book_list = HtmlField(css_select="div.book-list>ul>li", many=True)

    async def clean_ranking_title(self, ranking_title):
        """Normalize the ranking title.

        A list input yields the ``.text`` of its first element. Any other
        input is stringified, cut at the first '榜', and '榜' is appended
        again (it is appended even when the input did not contain it).
        """
        if not isinstance(ranking_title, list):
            prefix = str(ranking_title).split("榜")[0]
            return prefix + "榜"
        return ranking_title[0].text

    async def clean_more(self, more):
        """Return ``more`` prefixed with ``"https:"``.

        NOTE(review): presumably the scraped href is protocol-relative
        (``//host/path``) — confirm against the target page.
        """
        scheme = "https:"
        return scheme + more


class NameItem(Item):
    """Item holding two name fields, each defaulting to an empty string."""

    # Selected from <h4>.
    top_name = TextField(
        css_select="h4",
        default="",
    )
    # Selected from <a class="name">.
    other_name = TextField(
        css_select="a.name",
        default="",
    )
github howie6879 / owllook / owllook / spiders / zh_ranking.py View on Github external
"""
 Created by howie.hu at 29/11/2017.
"""
import time

from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware

from owllook.database.mongodb import MotorBaseOld


class RankingItem(Item):
    """Item describing one ranking block: its title, "more" link and book list."""

    # NOTE(review): presumably ruia treats ``target_item`` as the per-item
    # root selector — confirm against the ruia Item documentation.
    target_item = TextField(css_select="div.rank_i_p_list")
    ranking_title = TextField(css_select="div.rank_i_p_tit")
    # ``href`` attribute of the link inside ``div.rank_i_more``.
    more = AttrField(css_select="div.rank_i_more a", attr="href")
    # One HTML fragment per ``div.rank_i_li`` entry.
    book_list = HtmlField(
        css_select="div.rank_i_p_list>div.rank_i_li",
        many=True,
    )


class NameItem(Item):
    """Item holding two name fields, each defaulting to an empty string."""

    # Narrower selector: only anchors carrying the ``rank_i_l_a_book`` class.
    top_name = TextField(
        css_select="div.rank_i_bname a.rank_i_l_a_book",
        default="",
    )
    # Broader selector: any anchor under ``div.rank_i_bname``.
    other_name = TextField(
        css_select="div.rank_i_bname a",
        default="",
    )


class ZHRankingSpider(Spider):
    start_urls = ['http://book.zongheng.com/rank.html']

    concurrency = 3

    async def parse(self, res):
        result = []
        res_dic = {}