How to use the comiccrawler.url.urljoin function in comiccrawler

To help you get started, we’ve selected a few comiccrawler examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github eight04 / ComicCrawler / comiccrawler / mods / xznj120.py View on Github external
show_js = grabhtml(urljoin(url, show_js_src))
	real_pic_fn = re.search(r'(function f_qTcms_Pic_curUrl_realpic[\s\S]+?)function', show_js).group(1)
	code = """
	{script}
	{real_pic_fn}
	function base64_decode(data) {{
		return Buffer.from(data, "base64").toString();
	}}
	// m.wuyouhui.net/template/wap1/css/d7s/js/show.20170501.js?20190506201115
	Buffer.from(qTcms_S_m_murl_e, "base64")
		.toString()
		.split("$qingtiandy$")
		.filter(u => !/^(--|\+)/.test(u))
		.map(f_qTcms_Pic_curUrl_realpic);
	""".format(script=script, real_pic_fn=real_pic_fn)
	return [urljoin(url, i) for i in eval(code)]
github eight04 / ComicCrawler / comiccrawler / mods / weibo.py View on Github external
def get_next_page(html, url):
	match = re.search('class="pgNext">
github eight04 / ComicCrawler / comiccrawler / mods / artstation.py View on Github external
def errorhandler(err, crawler):
	"""Recover from a 403 on an ArtStation artwork page.

	When the failed request targeted an ``artwork/<id>`` URL, rewrite the
	crawler's current URL to the equivalent ``/projects/<id>.json``
	endpoint so the retry fetches the JSON API instead.
	"""
	if not is_http(err, 403):
		return
	current = crawler.ep.current_url
	if not current:
		return
	found = re.search("artwork/([^/]+)", current)
	if not found:
		return
	project_id = found.group(1)
	crawler.ep.current_url = urljoin(current, "/projects/{}.json".format(project_id))
github eight04 / ComicCrawler / comiccrawler / mods / xznj120.py View on Github external
def get_episodes(html, url):
	"""Extract episode entries from a chapter-list page.

	Each regex match is expected to yield (href, title); both are
	HTML-unescaped, the href is resolved against *url*, and the list is
	reversed before being returned.
	"""
	# NOTE(review): the pattern below looks HTML-entity-corrupted from the
	# page scrape ("&gt;"/"&lt;" in place of ">"/"<", and the first group is
	# never closed) — as written re.finditer raises re.error at runtime.
	# Confirm against the upstream xznj120.py source before relying on it.
	s = []
	for match in re.finditer(r'<li><a href="([^">]+&gt;<p>([^&lt;]+)', html):
		ep_url, title = [unescape(t) for t in match.groups()]
		s.append(Episode(title, urljoin(url, ep_url)))
	return s[::-1]
	</p></a></li>
github eight04 / ComicCrawler / comiccrawler / mods / nijie.py View on Github external
def get_episodes(html, url):
	s = []
	ep_set = set()
	
	for m in re.finditer(r'
github eight04 / ComicCrawler / comiccrawler / mods / qq.py View on Github external
def get_images(html, url):
	"""Assemble the JS program that decodes a qq (ac.qq.com-style) chapter.

	Pulls the encrypted ``DATA`` blob, the nonce, and the chapter-view
	script out of *html*, then concatenates them into *code* with a Proxy
	stub standing in for ``window``/``document`` so the site's eval'd
	script can run outside a browser.

	NOTE(review): this snippet appears truncated by the page scrape — it
	builds *code* but never executes or returns anything; presumably the
	original evaluates *code* and returns the image URLs. Confirm against
	the upstream qq.py source.
	"""
	# Keep the whole "var DATA = '...'" statement so it can be prepended
	# verbatim to the generated script.
	data = re.search("var DATA\s*=\s*'[^']+'", html).group()
	nonce = re.search("window\.nonce = (.+)", html).group(1)
	# Alternate nonce assignment via window[...]; when present it is
	# preferred over the plain window.nonce value (see `nonce2 or nonce`).
	nonce2 = re.search("window\[.+?=(.+)", html)
	nonce2 = nonce2.group(1) if nonce2 else None
	
	# Fetch the page.chapter.view script and keep only the leading
	# eval(...) payload — everything before the closing "}()".
	view_js = re.search('src="([^"]+?page\.chapter\.view[^"]+?\.js[^"]*)', html).group(1)
	view_js = grabhtml(urljoin(url, view_js))
	view_js = re.search("(eval\(.+?)\}\(\)", view_js, re.DOTALL).group(1)
	
	code = "\n".join([
		data,
		"""
		function createDummy() {
			return new Proxy(() => true, {
				get: () => createDummy()
			});
		}
		const window = document = createDummy();
		""",
		"const nonce = {};".format(nonce2 or nonce),
		"const W = {DATA, nonce};",
		view_js
	])
github eight04 / ComicCrawler / comiccrawler / mods / artstation.py View on Github external
def get_next_page(html, url):
	"""Return the URL of the next ArtStation listing page, or None.

	For a project-list JSON page (*url* passes ``is_project``), read the
	current ``page`` query parameter and advance it by one until
	``total_count`` entries (paged by EP_PER_PAGE) are covered.  For a
	user home page, jump to page 1 of that user's ``projects.json``.
	Any other URL shape yields None implicitly.
	"""
	if is_project(url):
		page = int(parse_qs(urlparse(url).query)["page"][0])
		total_page = math.ceil(json.loads(html)["total_count"] / EP_PER_PAGE)
		# Fixed: the scraped original contained the HTML entity "&lt;"
		# where the "<" comparison belongs, which is a SyntaxError.
		return update_qs(url, {"page": page + 1}) if page < total_page else None

	if is_user_home(url):
		user = re.search(r"www\.artstation\.com/([^/]+)", url).group(1)
		return urljoin(url, "/users/{user}/projects.json?page=1".format(user=user))
github eight04 / ComicCrawler / comiccrawler / mods / senmanga.py View on Github external
def get_episodes(html, url):
	"""List episodes linked from *url*'s chapter index.

	Only anchors whose href begins with *url* itself (hence the escaped
	prefix) are matched; the collected Episodes are returned in reverse
	of page order.
	"""
	prefix = re.escape(url)
	s = []
	# NOTE(review): the pattern below looks HTML-entity-corrupted from the
	# page scrape ("&gt;"/"&lt;" and an unclosed first group) — as written
	# it raises re.error at runtime. Confirm against upstream senmanga.py.
	for m in re.finditer(r'<a href="({}/[^/">]*&gt;([^&lt;]+)'.format(prefix), html):
		ep_url, title = m.groups()
		s.append(Episode(title, urljoin(url, ep_url)))
	return s[::-1]
</a>
github eight04 / ComicCrawler / comiccrawler / mods / nijie.py View on Github external
def get_next_page(html, url):
	match = re.search(r'
github eight04 / ComicCrawler / comiccrawler / mods / facebook.py View on Github external
def get_images(html, url):
	"""Return the direct download URL for the photo page at *url*.

	Scrapes the ``fb_dtsg`` CSRF token out of *html*, POSTs to Facebook's
	snowlift photo-menu endpoint for the (set, fbid) pair encoded in
	*url*, then pulls the ``download_photo`` href out of the JS response
	and resolves it against *url*.
	"""
	album_set, photo_id = get_url_info(url)
	token = re.search('name="fb_dtsg" value="([^"]+)', html).group(1)
	menu_js = grabhtml(
		"https://www.facebook.com/ajax/photos/snowlift/menu/",
		method="POST",
		params={"fbid": photo_id, "set": album_set},
		data={"__a": 1, "fb_dtsg": token}
	)
	# The href value is a JSON-encoded string embedded in the JS payload.
	raw_href = re.search('"download_photo","href":(.+?),"', menu_js).group(1)
	return urljoin(url, json.loads(raw_href))