# -*- coding: utf-8 -*-
import json
from icrawler import Crawler, Feeder, Parser, ImageDownloader
from icrawler.builtin.filter import Filter
class BaiduFeeder(Feeder):

    def get_filter(self):
        search_filter = Filter()

        # type filter: map each named type to Baidu's query-string code
        type_code = {
            'portrait': 's=3&lm=0&st=-1&face=0',
            'face': 's=0&lm=0&st=-1&face=1',
            'clipart': 's=0&lm=0&st=1&face=0',
            'linedrawing': 's=0&lm=0&st=2&face=0',
            'animated': 's=0&lm=6&st=-1&face=0',
            'static': 's=0&lm=7&st=-1&face=0'
        }

        def format_type(img_type):
            return type_code[img_type]

        search_filter.add_rule('type', format_type)
        return search_filter
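# Usage sketch (an assumption, not shown in the original snippet): the
# builtin BaiduImageCrawler wires BaiduFeeder in, so the 'type' rule above
# can be driven through crawl(); keyword and root_dir are placeholder values.
if __name__ == '__main__':
    from icrawler.builtin import BaiduImageCrawler

    crawler = BaiduImageCrawler(storage={'root_dir': 'baidu_images'})
    crawler.crawl(keyword='cat', filters=dict(type='animated'), max_num=50)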
# -*- coding: utf-8 -*-
import re
import six
from bs4 import BeautifulSoup
from six.moves import html_parser
from icrawler import Crawler, Parser, Feeder, ImageDownloader
from icrawler.builtin.filter import Filter
class BingFeeder(Feeder):

    def get_filter(self):
        search_filter = Filter()

        # type filter
        def format_type(img_type):
            prefix = '+filterui:photo-'
            return (prefix + 'animatedgif'
                    if img_type == 'animated' else prefix + img_type)

        type_choices = [
            'photo', 'clipart', 'linedrawing', 'transparent', 'animated'
        ]
        search_filter.add_rule('type', format_type, type_choices)

        # color filter
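# Usage sketch (assumed API, mirroring the pattern above): BingImageCrawler
# feeds these rules, so e.g. filters=dict(type='animated') is translated by
# format_type into the '+filterui:photo-animatedgif' query fragment.
if __name__ == '__main__':
    from icrawler.builtin import BingImageCrawler

    crawler = BingImageCrawler(storage={'root_dir': 'bing_images'})
    crawler.crawl(keyword='sunset', filters=dict(type='photo'), max_num=20)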
# -*- coding: utf-8 -*-
import json
import logging

from bs4 import BeautifulSoup
from six.moves.urllib.parse import urlencode

from icrawler import Feeder, Parser, Downloader, ImageDownloader
class GoogleFeeder(Feeder):

    def feed(self, keyword, offset, max_num, date_min, date_max):
        base_url = 'https://www.google.com/search?site=imghp&tbm=isch&source=hp&'
        for i in range(offset, offset + max_num, 100):
            cd_min = date_min.strftime('%d/%m/%Y') if date_min else ''
            cd_max = date_max.strftime('%d/%m/%Y') if date_max else ''
            tbs = 'cdr:1,cd_min:{},cd_max:{}'.format(cd_min, cd_max)
            params = dict(
                q=keyword, ijn=int(i / 100), start=i, tbs=tbs, tbm='isch')
            url = base_url + urlencode(params)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
# Note: this initializer belongs to the base Crawler class
# (icrawler/crawler.py); it wires the Feeder, Parser and Downloader
# components together with thread pools and a storage backend.
class Crawler(object):

    def __init__(self,
                 feeder_cls=Feeder,
                 parser_cls=Parser,
                 downloader_cls=Downloader,
                 feeder_threads=1,
                 parser_threads=1,
                 downloader_threads=1,
                 storage={
                     'backend': 'FileSystem',
                     'root_dir': 'images'
                 },
                 log_level=logging.INFO,
                 extra_feeder_args=None,
                 extra_parser_args=None,
                 extra_downloader_args=None):
        """Init components with class names and other arguments.

        Args:
            ...
        """
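# Usage sketch (an assumption based on the feed() signature above, which is
# the older icrawler API): date_min/date_max are datetime.date objects that
# GoogleFeeder folds into the 'tbs=cdr:...' query parameter.
if __name__ == '__main__':
    import datetime

    from icrawler.builtin import GoogleImageCrawler

    crawler = GoogleImageCrawler(storage={'root_dir': 'google_images'})
    crawler.crawl(keyword='landscape', max_num=100,
                  date_min=datetime.date(2017, 1, 1),
                  date_max=datetime.date(2017, 6, 30))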
# -*- coding: utf-8 -*-
import re
import time
from bs4 import BeautifulSoup
from six.moves.urllib.parse import urljoin, urlsplit
from icrawler import Crawler, Feeder, Parser, ImageDownloader
class GreedyFeeder(Feeder):

    def feed(self, domains):
        for domain in domains:
            self.output(domain)
        # keep the feeder alive until the crawler signals it has enough
        while not self.signal.get('reach_max_num'):
            time.sleep(1)


class GreedyParser(Parser):

    def __init__(self, *args, **kwargs):
        # matches absolute or protocol-relative URLs with an image suffix
        self.pattern = re.compile(
            r'(http|\/\/)(.*)\.(jpg|jpeg|png|bmp|gif|tiff)')
        super(GreedyParser, self).__init__(*args, **kwargs)

    def is_in_domain(self, url, domains):
        for domain in domains:
            if domain in url:
                return True
        return False
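# Usage sketch (assumed API): GreedyImageCrawler walks every page reachable
# from the given domains and downloads anything matching the image-URL
# pattern above; the domain below is a placeholder.
if __name__ == '__main__':
    from icrawler.builtin import GreedyImageCrawler

    crawler = GreedyImageCrawler(storage={'root_dir': 'greedy_images'})
    crawler.crawl(domains='http://example.com', max_num=10)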
# -*- coding: utf-8 -*-
import datetime
import json
from bs4 import BeautifulSoup
from six.moves.urllib.parse import urlencode
from icrawler import Crawler, Feeder, Parser, ImageDownloader
from icrawler.builtin.filter import Filter
class GoogleFeeder(Feeder):

    def get_filter(self):
        search_filter = Filter()

        # type filter
        def format_type(img_type):
            return ('itp:lineart'
                    if img_type == 'linedrawing' else 'itp:' + img_type)

        type_choices = ['photo', 'face', 'clipart', 'linedrawing', 'animated']
        search_filter.add_rule('type', format_type, type_choices)

        # color filter
        def format_color(color):
            if color in ['color', 'blackandwhite', 'transparent']:
                code = {
                    'color': 'color',
                    'blackandwhite': 'gray',
                    'transparent': 'trans'
                }
                return 'ic:' + code[color]
            else:
                return 'ic:specific,isc:' + color

        search_filter.add_rule('color', format_color)
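# Usage sketch (assumed API): with the rules above registered, a filters
# dict is translated into Google's 'tbs' codes, e.g. type='linedrawing'
# becomes 'itp:lineart' and color='blackandwhite' becomes 'ic:gray'.
if __name__ == '__main__':
    from icrawler.builtin import GoogleImageCrawler

    crawler = GoogleImageCrawler(storage={'root_dir': 'google_images'})
    crawler.crawl(keyword='tree', max_num=30,
                  filters=dict(type='linedrawing', color='blackandwhite'))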
# -*- coding: utf-8 -*-
import datetime
import json
import math
import os
from six.moves.urllib.parse import urlencode
from icrawler import Crawler, Feeder, Parser, ImageDownloader
class FlickrFeeder(Feeder):

    def feed(self, apikey, max_num=4000, **kwargs):
        # the Flickr search API caps results at 4000 per query
        if max_num > 4000:
            max_num = 4000
            self.logger.warning(
                'max_num exceeds 4000, set it to 4000 automatically.')
        base_url = 'https://api.flickr.com/services/rest/?'
        params = {
            'method': 'flickr.photos.search',
            'api_key': apikey,
            'format': 'json',
            'nojsoncallback': 1
        }
        # forward only whitelisted flickr.photos.search arguments;
        # the full whitelist continues beyond what this snippet shows
        for key in kwargs:
            if key in ['user_id', 'tags', 'tag_mode', 'text', 'license',
                       'sort', 'privacy_filter', 'accuracy', 'safe_search']:
                params[key] = kwargs[key]
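# Usage sketch (assumed API, matching the whitelist above): the builtin
# FlickrImageCrawler takes the API key in its constructor and forwards extra
# crawl() kwargs to flickr.photos.search; the key and tags are placeholders.
if __name__ == '__main__':
    from icrawler.builtin import FlickrImageCrawler

    crawler = FlickrImageCrawler('your-flickr-api-key',
                                 storage={'root_dir': 'flickr_images'})
    crawler.crawl(max_num=100, tags='macro,flower', tag_mode='all')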