if len(tmp) > 1 and tmp[-1] not in [
'html', 'shtml', 'shtm', 'php', 'jsp', 'asp'
]:
continue
# discard urls such as 'javascript:void(0)'
elif href.find('javascript', 0, 10) == 0:
continue
# discard urls such as 'android-app://xxxxxxxxx'
elif urlsplit(href).scheme not in ['http', 'https', 'ftp']:
continue
# urls of the same domain
elif self.is_in_domain(href, domains):
yield href
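
# A hedged sketch of the domain check used above; this is an assumption about
# what is_in_domain does (keep only links whose URL contains one of the target
# domains), not necessarily the library's exact implementation.
def is_in_domain(url, domains):
    """Return True if url belongs to any of the given domains."""
    return any(domain in url for domain in domains)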
class GreedyImageCrawler(Crawler):
def __init__(self,
feeder_cls=GreedyFeeder,
parser_cls=GreedyParser,
downloader_cls=ImageDownloader,
*args,
**kwargs):
super(GreedyImageCrawler, self).__init__(
feeder_cls, parser_cls, downloader_cls, *args, **kwargs)
def crawl(self,
domains,
max_num=0,
min_size=None,
max_size=None,
file_idx_offset=0):
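
# A minimal usage sketch of GreedyImageCrawler based on the crawl() signature
# above; the target site and output directory are placeholders.
from icrawler.builtin import GreedyImageCrawler

greedy_crawler = GreedyImageCrawler(storage={'root_dir': 'images/greedy'})
greedy_crawler.crawl(domains='http://example.com', max_num=10,
                     min_size=(100, 100), max_size=None)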
from icrawler.storage import BaseStorage
from io import BytesIO
class GoogleStorage(BaseStorage):
"""Google Storage backend.
        The id is the filename; data is stored as text or binary files.
        The root_dir is the bucket address such as gs://<your_bucket>/<your_dir>.
"""
def __init__(self, root_dir):
        try:
            from google.cloud import storage
        except ImportError:
            print('GoogleStorage backend requires the package '
                  '"google-cloud-storage", execute '
                  '"pip install google-cloud-storage" to install it.')
            raise  # without the package, storage.Client() below would fail with a NameError
        self.client = storage.Client()
bucket_str = root_dir[5:].split('/')[0]
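
# Illustration of the bucket parsing above: root_dir[5:] drops the leading
# 'gs://' and the first path component is the bucket name (the bucket and
# prefix here are placeholders).
root_dir = 'gs://my-bucket/images'
bucket_str = root_dir[5:].split('/')[0]  # -> 'my-bucket'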
def set_storage(self, storage):
"""Set storage backend for downloader
For full list of storage backend supported, please see :mod:`storage`.
Args:
storage (dict or BaseStorage): storage backend configuration or instance
"""
if isinstance(storage, BaseStorage):
self.storage = storage
elif isinstance(storage, dict):
if 'backend' not in storage and 'root_dir' in storage:
storage['backend'] = 'FileSystem'
try:
backend_cls = getattr(storage_package, storage['backend'])
except AttributeError:
try:
backend_cls = import_module(storage['backend'])
except ImportError:
self.logger.error('cannot find backend module %s',
storage['backend'])
sys.exit()
kwargs = storage.copy()
del kwargs['backend']
self.storage = backend_cls(**kwargs)
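
# Hedged examples of the two forms accepted by set_storage above; paths and
# the bucket name are placeholders. A dict carrying only 'root_dir' falls back
# to the FileSystem backend, while a BaseStorage instance is used as-is.
from icrawler.builtin import BingImageCrawler

crawler = BingImageCrawler()
crawler.set_storage({'root_dir': 'images'})                    # dict form
# crawler.set_storage(GoogleStorage('gs://my-bucket/images'))  # instance form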
return search_filter
def feed(self, keyword, offset, max_num, filters=None):
base_url = ('http://image.baidu.com/search/acjson?tn=resultjson_com'
'&ipn=rj&word={}&pn={}&rn=30')
self.filter = self.get_filter()
filter_str = self.filter.apply(filters, sep='&')
for i in range(offset, offset + max_num, 30):
url = base_url.format(keyword, i)
if filter_str:
url += '&' + filter_str
self.out_queue.put(url)
self.logger.debug('put url to url_queue: {}'.format(url))
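
# Hedged usage sketch for the feeder above via the built-in Baidu crawler;
# the keyword, offsets and output directory are placeholders.
from icrawler.builtin import BaiduImageCrawler

baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'images/baidu'})
baidu_crawler.crawl(keyword='cat', offset=0, max_num=60)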
class BaiduParser(Parser):
def _decode_url(self, encrypted_url):
url = encrypted_url
        # map1 replaces multi-character tokens; map2 is a per-character substitution cipher
        map1 = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
map2 = {
'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e',
'u': 'f', '2': 'g', 'i': 'h', 't': 'i', '3': 'j',
'h': 'k', 's': 'l', '4': 'm', 'g': 'n', '5': 'o',
'r': 'p', 'q': 'q', '6': 'r', 'f': 's', 'p': 't',
'7': 'u', 'e': 'v', 'o': 'w', '8': '1', 'd': '2',
'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
'b': '8', 'l': '9', 'a': '0'
} # yapf: disable
for (ciphertext, plaintext) in map1.items():
url = url.replace(ciphertext, plaintext)
        char_list = [char for char in url]
        # apply the per-character substitution and reassemble the decoded url
        for i in range(len(char_list)):
            if char_list[i] in map2:
                char_list[i] = map2[char_list[i]]
        url = ''.join(char_list)
        return url
def __init__(self,
feeder_cls=Feeder,
parser_cls=Parser,
downloader_cls=Downloader,
feeder_threads=1,
parser_threads=1,
downloader_threads=1,
storage={
'backend': 'FileSystem',
'root_dir': 'images'
},
log_level=logging.INFO,
extra_feeder_args=None,
extra_parser_args=None,
extra_downloader_args=None):
"""Init components with class names and other arguments.
Args:
feeder_cls: class of feeder
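
# Hedged sketch of how the constructor arguments above are normally supplied
# through a built-in subclass (extra keyword arguments are forwarded to
# Crawler.__init__); thread counts and the output directory are placeholders.
from icrawler.builtin import GoogleImageCrawler

crawler = GoogleImageCrawler(
    feeder_threads=1,
    parser_threads=2,
    downloader_threads=4,
    storage={'backend': 'FileSystem', 'root_dir': 'images'})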
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
from six.moves import html_parser
from icrawler import Crawler, Parser, SimpleSEFeeder, ImageDownloader
class BingParser(Parser):
def parse(self, response):
soup = BeautifulSoup(response.content, 'lxml')
image_divs = soup.find_all('div', class_='imgpt')
pattern = re.compile(r'murl\":\"(.*?)\.jpg')
for div in image_divs:
href_str = html_parser.HTMLParser().unescape(div.a['m'])
match = pattern.search(href_str)
if match:
img_url = '{}.jpg'.format(match.group(1))
yield dict(file_url=img_url)
class BingImageCrawler(Crawler):
def __init__(self,
return search_filter
def feed(self, keyword, offset, max_num, filters=None):
base_url = 'https://www.bing.com/images/async?q={}&first={}'
self.filter = self.get_filter()
filter_str = self.filter.apply(filters)
filter_str = '&qft=' + filter_str if filter_str else ''
for i in range(offset, offset + max_num, 20):
url = base_url.format(keyword, i) + filter_str
self.out_queue.put(url)
self.logger.debug('put url to url_queue: {}'.format(url))
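
# Hedged usage sketch for the Bing feeder above; the filter key/value shown
# follows the icrawler documentation but should be treated as an assumption,
# and the keyword and output directory are placeholders.
from icrawler.builtin import BingImageCrawler

bing_crawler = BingImageCrawler(downloader_threads=4,
                                storage={'root_dir': 'images/bing'})
bing_crawler.crawl(keyword='cat', filters={'size': 'large'}, max_num=100)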
class BingParser(Parser):
def parse(self, response):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'lxml')
image_divs = soup.find_all('div', class_='imgpt')
pattern = re.compile(r'murl\":\"(.*?)\.jpg')
for div in image_divs:
href_str = html_parser.HTMLParser().unescape(div.a['m'])
match = pattern.search(href_str)
if match:
name = (match.group(1)
if six.PY3 else match.group(1).encode('utf-8'))
img_url = '{}.jpg'.format(name)
yield dict(file_url=img_url)
class GoogleFeeder(Feeder):
def feed(self, keyword, offset, max_num, date_min, date_max):
base_url = 'https://www.google.com/search?site=imghp&tbm=isch&source=hp&'
for i in range(offset, offset + max_num, 100):
cd_min = date_min.strftime('%d/%m/%Y') if date_min else ''
cd_max = date_max.strftime('%d/%m/%Y') if date_max else ''
tbs = 'cdr:1,cd_min:{},cd_max:{}'.format(cd_min, cd_max)
params = dict(
q=keyword, ijn=int(i / 100), start=i, tbs=tbs, tbm='isch')
url = base_url + urlencode(params)
self.out_queue.put(url)
self.logger.debug('put url to url_queue: {}'.format(url))
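
# Hedged usage sketch for the date-range feeder above, assuming the crawl()
# wrapper of this (older) API forwards date_min and date_max; since they are
# formatted with strftime, plain datetime.date objects are used here, and the
# keyword, dates and output directory are placeholders.
from datetime import date
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'images/google'})
google_crawler.crawl(keyword='cat', max_num=100,
                     date_min=date(2017, 1, 1), date_max=date(2017, 6, 30))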
class GoogleParser(Parser):
def parse(self, response):
soup = BeautifulSoup(response.content, 'lxml')
image_divs = soup.find_all('div', class_='rg_meta')
for div in image_divs:
meta = json.loads(div.text)
if 'ou' in meta:
yield dict(file_url=meta['ou'])
class GoogleImageCrawler(Crawler):
def __init__(self,
feeder_cls=GoogleFeeder,
parser_cls=GoogleParser,
downloader_cls=ImageDownloader,
filter_str = self.filter.apply(filters, sep=',')
for i in range(offset, offset + max_num, 100):
params = dict(
q=keyword,
ijn=int(i / 100),
start=i,
tbs=filter_str,
tbm='isch')
if language:
params['lr'] = 'lang_' + language
url = base_url + urlencode(params)
self.out_queue.put(url)
self.logger.debug('put url to url_queue: {}'.format(url))
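
# Hedged usage sketch for the filter-based feeder above, assuming crawl()
# forwards filters and language as the feeder signature suggests; the filter
# keys (size, color) and the language code are assumptions, and the keyword
# and output directory are placeholders.
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'images/google'})
google_crawler.crawl(keyword='cat', max_num=100, language='en',
                     filters={'size': 'large', 'color': 'orange'})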
class GoogleParser(Parser):
def parse(self, response):
soup = BeautifulSoup(
response.content.decode('utf-8', 'ignore'), 'lxml')
image_divs = soup.find_all('div', class_='rg_meta')
for div in image_divs:
meta = json.loads(div.text)
if 'ou' in meta:
yield dict(file_url=meta['ou'])
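
# Illustration of the metadata consumed above: each div.rg_meta holds a JSON
# blob and 'ou' is the original image URL; the other keys and all values here
# are placeholders based on commonly observed fields, not a guaranteed schema.
meta = {'ou': 'http://example.com/image.jpg', 'oh': 600, 'ow': 800}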
class GoogleImageCrawler(Crawler):
def __init__(self,
feeder_cls=GoogleFeeder,
parser_cls=GoogleParser,
for i in range(page, page + page_max):
if self.signal.get('reach_max_num'):
break
complete_url = '{}&page={}'.format(url, i)
while True:
try:
self.output(complete_url, block=False)
            except:  # out_queue is full; retry until space frees or max_num is reached
if self.signal.get('reach_max_num'):
break
else:
break
self.logger.debug('put url to url_queue: {}'.format(complete_url))
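
# Hedged usage sketch for the paging feeder above via the built-in Flickr
# crawler; the API key, tags and output directory are placeholders, and extra
# keyword arguments are assumed to be forwarded to the Flickr search API.
from icrawler.builtin import FlickrImageCrawler

flickr_crawler = FlickrImageCrawler('your_api_key',
                                    storage={'root_dir': 'images/flickr'})
flickr_crawler.crawl(max_num=100, tags='cat,kitten')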
class FlickrParser(Parser):
def parse(self, response, apikey, size_preference=None):
content = json.loads(response.content.decode('utf-8', 'ignore'))
if content['stat'] != 'ok':
return
photos = content['photos']['photo']
for photo in photos:
photo_id = photo['id']
base_url = 'https://api.flickr.com/services/rest/?'
params = {
'method': 'flickr.photos.getSizes',
'api_key': apikey,
'photo_id': photo_id,
'format': 'json',
'nojsoncallback': 1
}
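
# Hedged sketch of how the base_url and params built above could be turned
# into a flickr.photos.getSizes request; the use of requests here and picking
# the last (largest) size are assumptions about how the parser continues.
import json
import requests
from six.moves.urllib.parse import urlencode

ret = requests.get(base_url + urlencode(params))
sizes = json.loads(ret.content.decode('utf-8', 'ignore'))['sizes']['size']
largest_url = sizes[-1]['source']  # sizes are assumed ordered smallest to largest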