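# Shared imports assumed by the snippets below. The fragments appear to come
# from icrawler's builtin crawlers, so the exact import lines in the library
# may differ; this header is only a sketch that makes them self-contained.
# Feeder classes such as GoogleFeeder, BingFeeder, BaiduFeeder, FlickrFeeder,
# GreedyFeeder, SimpleSEFeeder and UrlListFeeder are defined alongside these
# snippets in the same modules and are therefore not imported here.
import json
import os
import re
import threading

import six
from bs4 import BeautifulSoup
from six.moves import html_parser, queue
from six.moves.urllib.parse import urlsplit

from icrawler import Crawler, Parser, ImageDownloader
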
class GreedyParser(Parser):

    def parse(self, response, domains):
        # Assumed wrapper: the original snippet only shows the filtering chain
        # below; the surrounding method and loop are reconstructed here.
        soup = BeautifulSoup(response.content, 'lxml')
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            tmp = href.split('.')
            # discard urls whose extension does not look like a web page
            if len(tmp) > 1 and tmp[-1] not in [
                    'html', 'shtml', 'shtm', 'php', 'jsp', 'asp'
            ]:
                continue
            # discard urls such as 'javascript:void(0)'
            elif href.find('javascript', 0, 10) == 0:
                continue
            # discard urls such as 'android-app://xxxxxxxxx'
            elif urlsplit(href).scheme not in ['http', 'https', 'ftp']:
                continue
            # urls of the same domain
            elif self.is_in_domain(href, domains):
                yield href

class GreedyImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GreedyFeeder,
                 parser_cls=GreedyParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GreedyImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              domains,
              max_num=0,
              min_size=None,
              max_size=None,
              file_idx_offset=0):
        # Assumed body (the snippet stops at the signature): pack the
        # arguments into feeder/downloader kwargs and delegate to
        # Crawler.crawl, as UrlListCrawler.crawl does below.
        feeder_kwargs = dict(domains=domains)
        downloader_kwargs = dict(
            max_num=max_num,
            min_size=min_size,
            max_size=max_size,
            file_idx_offset=file_idx_offset)
        super(GreedyImageCrawler, self).crawl(
            feeder_kwargs=feeder_kwargs, downloader_kwargs=downloader_kwargs)

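# Usage sketch (assumed, following icrawler's documented API): crawl images
# linked from a site's pages into a local directory. 'your_image_dir' and the
# domain below are placeholders.
greedy_crawler = GreedyImageCrawler(storage={'root_dir': 'your_image_dir'})
greedy_crawler.crawl(domains='http://www.bbc.com/news', max_num=10,
                     min_size=None, max_size=None)
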
class BingParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            # the 'm' attribute holds an html-escaped JSON blob whose 'murl'
            # field is the original image url
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                name = (match.group(1)
                        if six.PY3 else match.group(1).encode('utf-8'))
                img_url = '{}.jpg'.format(name)
                yield dict(file_url=img_url)

class BingImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=BingFeeder,
                 parser_cls=BingParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BingImageCrawler, self).__init__(feeder_cls, parser_cls,
                                               downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
              max_size=None):
        # Assumed completion: the original snippet is cut off after min_size.
        # Like the other crawlers, the arguments are packed into feeder and
        # downloader kwargs and passed on to Crawler.crawl.
        feeder_kwargs = dict(
            keyword=keyword, offset=offset, max_num=max_num, filters=filters)
        downloader_kwargs = dict(
            max_num=max_num, min_size=min_size, max_size=max_size)
        super(BingImageCrawler, self).crawl(
            feeder_kwargs=feeder_kwargs, downloader_kwargs=downloader_kwargs)

        # (assumed: tail of GoogleFeeder.feed, logging each search-page url
        # after it is put on the url queue)
        self.logger.debug('put url to url_queue: {}'.format(url))


class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        # Google serves per-image metadata as JSON inside div.rg_meta
        # elements; the 'ou' field is the original image url
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])

class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GoogleImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
              max_size=None):
        # Assumed completion (the snippet is cut off here): same delegation
        # pattern as BingImageCrawler.crawl above.
        super(GoogleImageCrawler, self).crawl(
            feeder_kwargs=dict(
                keyword=keyword, offset=offset, max_num=max_num,
                filters=filters),
            downloader_kwargs=dict(
                max_num=max_num, min_size=min_size, max_size=max_size))

class BaiduParser(Parser):

    def parse(self, response):
        # Assumed wrapper: the snippet starts inside parse(); the response
        # body is Baidu's JSON-like payload.
        content = response.content.decode('utf-8', 'ignore')
        try:
            content = json.loads(content, strict=False)
        except ValueError:
            self.logger.error('Fail to parse the response in json format')
            return
        for item in content['data']:
            # objURL is obfuscated and has to be decoded; hoverURL is a plain
            # fallback
            if 'objURL' in item:
                img_url = self._decode_url(item['objURL'])
            elif 'hoverURL' in item:
                img_url = item['hoverURL']
            else:
                continue
            yield dict(file_url=img_url)

class BaiduImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=BaiduFeeder,
                 parser_cls=BaiduParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BaiduImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
              max_size=None):
        # Assumed completion (the snippet is cut off here): delegates to
        # Crawler.crawl exactly like the Bing and Google crawlers above.
        super(BaiduImageCrawler, self).crawl(
            feeder_kwargs=dict(
                keyword=keyword, offset=offset, max_num=max_num,
                filters=filters),
            downloader_kwargs=dict(
                max_num=max_num, min_size=min_size, max_size=max_size))

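# Usage sketch (assumed, following icrawler's documented API): the keyword
# crawlers above are used in the same way; 'your_image_dir' is a placeholder.
baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'your_image_dir'})
baidu_crawler.crawl(keyword='cat', offset=0, max_num=100,
                    min_size=(200, 200), max_size=None)
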
            # Assumed reconstruction of the cut-off lines: inside
            # FlickrParser.parse, iterating over photos; `ret` is the response
            # of a flickr.photos.getSizes request for the current `photo` and
            # `size_preference` an ordered list of acceptable size labels.
            try:
                info = json.loads(ret.content.decode('utf-8', 'ignore'))
            except ValueError:
                continue
            else:
                if info['stat'] == 'ok':
                    urls = {
                        item['label'].lower(): item['source']
                        for item in info['sizes']['size']
                    }
                else:
                    continue
            # yield the first url available in the preferred size order
            for sz in size_preference:
                if sz in urls:
                    yield dict(file_url=urls[sz], meta=photo)
                    break

class FlickrImageCrawler(Crawler):

    def __init__(self,
                 apikey=None,
                 feeder_cls=FlickrFeeder,
                 parser_cls=FlickrParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        # fall back to the FLICKR_APIKEY environment variable when no key is
        # passed explicitly
        if apikey is None:
            apikey = os.getenv('FLICKR_APIKEY')
            if not apikey:
                raise RuntimeError('apikey is not specified')
        self.apikey = apikey
        super(FlickrImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

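# Usage sketch (assumed, following icrawler's documented API): extra keyword
# arguments to crawl() are forwarded to the flickr.photos.search API;
# 'your_apikey' and 'your_image_dir' are placeholders.
flickr_crawler = FlickrImageCrawler('your_apikey',
                                    storage={'root_dir': 'your_image_dir'})
flickr_crawler.crawl(max_num=100, tags='cat,kitten')
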
        # (assumed: tail of GoogleFeeder.feed, pushing each search-page url
        # onto the url queue for the parser threads)
        self.out_queue.put(url)
        self.logger.debug('put url to url_queue: {}'.format(url))


class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])

class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GoogleImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              offset=0,
              max_num=1000,
              date_min=None,
              date_max=None):
        # Assumed completion (the snippet is cut off here): the date range and
        # the other arguments are handed to the feeder and downloader via
        # Crawler.crawl, as in the crawlers above.
        super(GoogleImageCrawler, self).crawl(
            feeder_kwargs=dict(keyword=keyword, offset=offset,
                               max_num=max_num, date_min=date_min,
                               date_max=date_max),
            downloader_kwargs=dict(max_num=max_num))

class BingParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                img_url = '{}.jpg'.format(match.group(1))
                yield dict(file_url=img_url)

class BingImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=SimpleSEFeeder,
                 parser_cls=BingParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BingImageCrawler, self).__init__(feeder_cls, parser_cls,
                                               downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              offset=0,
              max_num=1000,
              min_size=None,
              max_size=None):
        # Assumed completion (the snippet is cut off here): same delegation to
        # Crawler.crawl as in the crawlers above.
        super(BingImageCrawler, self).crawl(
            feeder_kwargs=dict(keyword=keyword, offset=offset,
                               max_num=max_num),
            downloader_kwargs=dict(
                max_num=max_num, min_size=min_size, max_size=max_size))

class PseudoParser(Parser):

    def worker_exec(self, queue_timeout=2, **kwargs):
        # Assumed wrapper: the snippet starts mid-loop. Each worker pulls page
        # urls from the input queue and forwards them straight to the
        # downloader as file urls.
        while True:
            try:
                url = self.in_queue.get(timeout=queue_timeout)
            except queue.Empty:
                if self.signal.get('feeder_exited'):
                    # original log message cut off in the snippet
                    self.logger.info('no more page urls for thread %s',
                                     threading.current_thread().name)
                    break
                else:
                    self.logger.info('%s is waiting for new page urls',
                                     threading.current_thread().name)
                    continue
            except Exception as e:
                self.logger.error('exception caught in thread %s: %s',
                                  threading.current_thread().name, e)
                continue
            else:
                self.logger.debug('start downloading page {}'.format(url))
            self.output({'file_url': url})

class UrlListCrawler(Crawler):

    def __init__(self,
                 feeder_cls=UrlListFeeder,
                 parser_cls=PseudoParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(UrlListCrawler, self).__init__(feeder_cls, parser_cls,
                                             downloader_cls, *args, **kwargs)

    def crawl(self, url_list, max_num=1000, file_idx_offset=0,
              overwrite=False):
        feeder_kwargs = dict(url_list=url_list)
        # Assumed completion (the snippet is cut off mid-dict): pass overwrite
        # through as well and delegate to Crawler.crawl.
        downloader_kwargs = dict(
            file_idx_offset=file_idx_offset,
            max_num=max_num,
            overwrite=overwrite)
        super(UrlListCrawler, self).crawl(
            feeder_kwargs=feeder_kwargs, downloader_kwargs=downloader_kwargs)

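# Usage sketch (assumed, following icrawler's documented API): url_list may be
# a list of image urls or the path to a text file with one url per line;
# 'your_image_dir' is a placeholder.
urllist_crawler = UrlListCrawler(downloader_threads=4,
                                 storage={'root_dir': 'your_image_dir'})
urllist_crawler.crawl('url_list.txt', max_num=100)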