Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_css_select():
    """Check that a ``TextField`` selecting ``head title`` extracts ``"ruia"``.

    Uses the ``html_etree`` name from the enclosing module scope as input.
    """
    field = TextField(css_select="head title")
    value = field.extract(html_etree=html_etree)
    assert value == "ruia"
# NOTE(review): a function with this same name and body is defined more than
# once in this file; at import time only the last definition is kept.
def test_attr_field():
    """Check ``AttrField`` extraction of ``href`` for a single match and for many.

    Uses the ``html`` name from the enclosing module scope as input.
    """
    # Single-match mode: the ``.title`` element's href is expected to be "/".
    title = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    assert title.extract(html_etree=html) == "/"

    # many=True: the result is indexed, and its first entry is checked.
    tags = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    assert tags.extract(html_etree=html)[0] == "./easy.html"
# NOTE(review): a function with this same name and body is defined more than
# once in this file; at import time only the last definition is kept.
def test_attr_field():
    """Check ``AttrField`` extraction of ``href`` for a single match and for many.

    Uses the ``html`` name from the enclosing module scope as input.
    """
    # Single-match mode: the ``.title`` element's href is expected to be "/".
    title = ruia.AttrField(css_select=".title", attr="href", default="Untitled")
    assert title.extract(html_etree=html) == "/"

    # many=True: the result is indexed, and its first entry is checked.
    tags = ruia.AttrField(css_select=".tag", attr="href", default="No tag", many=True)
    assert tags.extract(html_etree=html)[0] == "./easy.html"
def clean_title(self, title):
    """Return *title* prefixed with the literal ``"Title: "``.

    NOTE(review): the enclosing class header is not visible at this point in
    the file, so which class this method belongs to cannot be confirmed here.
    """
    return "Title: " + title
class DoubanIgnoreItem(Item):
    """Item with one ``title`` field whose clean hook always raises ``IgnoreThisItem``."""

    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Raise ``IgnoreThisItem`` unconditionally; *title* is never used."""
        raise IgnoreThisItem
class HackerNewsItem(Item):
    """Item declaring a ``tr.athing`` target plus a title and URL per story link."""

    target_item = TextField(css_select="tr.athing")
    # Both fields select the same ``a.storylink`` element: one for its text,
    # the other for its ``href`` attribute.
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")
async def parse_item(html):
    """Collect every item yielded by ``DoubanItems.get_items`` into a list.

    :param html: value passed through as the ``html`` keyword of ``get_items``.
    :return: list of the yielded items, in iteration order.
    """
    return [item async for item in DoubanItems.get_items(html=html)]
async def error_parse_item(html):
    """Collect every item yielded by ``DoubanItem.get_items`` into a list.

    NOTE(review): judging by the name, this presumably exists to exercise a
    failure path of ``get_items`` -- confirm against the caller.

    :param html: value passed through as the ``html`` keyword of ``get_items``.
    :return: list of the yielded items, in iteration order.
    """
    return [item async for item in DoubanItem.get_items(html=html)]
def test_no_start_url_spider():
    """Check that starting a spider with no ``start_urls`` raises ``ValueError``.

    The test fails if ``start()`` returns without raising; any exception other
    than ``ValueError`` propagates and also fails the test.
    """

    class NoStartUrlSpider(Spider):
        pass

    try:
        NoStartUrlSpider.start()
    except ValueError:
        pass  # the expected outcome
    else:
        # Without this branch the test would pass vacuously when nothing is raised.
        raise AssertionError("NoStartUrlSpider.start() did not raise ValueError")
def test_callback_error():
    """Start two spiders: one without ``parse``, one whose ``parse`` raises.

    There are no assertions; the test passes as long as neither ``start()``
    call lets an exception escape.
    """

    # Spider that defines no ``parse`` callback of its own.
    class NoParseSpider(Spider):
        start_urls = ["https://httpbin.org/get"]

    NoParseSpider.start()

    # Spider whose ``parse`` callback always raises ValueError.
    class CallbackError(Spider):
        start_urls = ["https://httpbin.org/get"]

        async def parse(self, response):
            raise ValueError("error")

    CallbackError.start()
def test_spider_hook_error():
    """Start a spider with a ``before_stop`` hook that raises ``TypeError``.

    There are no assertions; the test passes as long as ``start()`` does not
    let the hook's exception escape.
    """

    class SpiderDemo(Spider):
        start_urls = ["https://httpbin.org/get?p=0"]

        async def parse(self, response):
            pass

    async def before_stop_func(spider_ins):
        raise TypeError("error")

    # A fresh event loop is created and handed to the spider explicitly.
    loop = asyncio.new_event_loop()
    SpiderDemo.start(loop=loop, before_stop=before_stop_func)
def test_spider_hook():
async def after_start_func(spider_ins):
print("after_start_func")
spider_ins.result["after_start"] = True
assert isinstance(spider_ins.result, dict)
async def before_stop_func(spider_ins):
print("before_stop_func")
spider_ins.result["before_stop"] = True
class SpiderHook(Spider):
start_urls = ["https://httpbin.org/get?p=0", "https://httpbin.org/404"]
request_config = {"RETRIES": 1, "DELAY": 0, "TIMEOUT": 10}
result = {
"after_start": False,
"before_stop": False,
"process_succeed_response": False,
"process_failed_response": False,
"process_item": False,
}
async def parse(self, response):
item = await ItemDemo.get_item(html=HTML)
yield item
async def process_item(self, item):
class DoubanItem(Item):
    """Item with a ``title`` field, a constant attribute, and a title clean hook."""

    title = TextField(css_select="head title")
    # Plain class attribute, not a field.
    constant_attr = "hello ruia"

    async def clean_title(self, title):
        """Return *title* prefixed with the literal ``"Title: "``."""
        return "Title: " + title
class DoubanCleanMethodErrorItem(Item):
    """Item whose ``clean_title`` hook is a plain (non-async) function.

    NOTE(review): the other items in this file define ``clean_title`` with
    ``async def``; given the class name, the synchronous definition here is
    presumably deliberate -- do not "fix" it without checking the tests that
    use this class.
    """

    title = TextField(css_select="head title")

    def clean_title(self, title):
        """Return *title* prefixed with the literal ``"Title: "``."""
        return "Title: " + title
class DoubanIgnoreItem(Item):
    """Item with one ``title`` field whose clean hook always raises ``IgnoreThisItem``."""

    title = TextField(css_select="head title")

    async def clean_title(self, title):
        """Raise ``IgnoreThisItem`` unconditionally; *title* is never used."""
        raise IgnoreThisItem
class HackerNewsItem(Item):
    """Item declaring a ``tr.athing`` target plus a title and URL per story link."""

    target_item = TextField(css_select="tr.athing")
    # Both fields select the same ``a.storylink`` element: one for its text,
    # the other for its ``href`` attribute.
    title = TextField(css_select="a.storylink")
    url = AttrField(css_select="a.storylink", attr="href")
async def parse_item(html):
items = []
async for item in DoubanItems.get_items(html=html):
items.append(item)
#!/usr/bin/env python
import asyncio
import os
from ruia import AttrField, Item, TextField
from ruia.exceptions import IgnoreThisItem, InvalidFuncType
# Path of the HTML fixture: <directory of this file>/data/for_item_testing.html,
# with symlinks in this file's own path resolved first.
html_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "data", "for_item_testing.html"
)
# Read the whole fixture as UTF-8 text once, at import time.
with open(html_path, mode="r", encoding="utf-8") as file:
    HTML = file.read()
class DoubanItems(Item):
    """Item declaring a ``div.item`` target with title, cover and abstract fields."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(css_select="div.pic>a>img", attr="src")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return *title* as a single string.

        A ``str`` is returned unchanged. Any other value is iterated: each
        element's ``.text`` is stripped, has non-breaking spaces (U+00A0)
        removed, and the pieces are concatenated with no separator.
        """
        if isinstance(title, str):
            return title
        return "".join([node.text.strip().replace("\xa0", "") for node in title])
class DoubanItem(Item):
title = TextField(css_select="head title")
constant_attr = "hello ruia"