How to use the icrawler.Feeder class in icrawler

To help you get started, we’ve selected a few icrawler examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github hellock / icrawler / icrawler / builtin / baidu.py View on Github external
# -*- coding: utf-8 -*-

import json

from icrawler import Crawler, Feeder, Parser, ImageDownloader
from icrawler.builtin.filter import Filter


class BaiduFeeder(Feeder):
    # Feeder for Baidu image search. Only a fragment of this class is
    # visible in this excerpt: ``get_filter`` is truncated below (no
    # ``add_rule`` calls or return statement are shown).

    def get_filter(self):
        """Build the search Filter used to translate user options into
        Baidu query-string fragments (truncated in this excerpt)."""
        search_filter = Filter()

        # type filter
        # Maps a human-readable image type to the raw Baidu query-string
        # parameters (s/lm/st/face) that select it.
        type_code = {
            'portrait': 's=3&lm=0&st=-1&face=0',
            'face': 's=0&lm=0&st=-1&face=1',
            'clipart': 's=0&lm=0&st=1&face=0',
            'linedrawing': 's=0&lm=0&st=2&face=0',
            'animated': 's=0&lm=6&st=-1&face=0',
            'static': 's=0&lm=7&st=-1&face=0'
        }

        # Raises KeyError for any type not listed above; presumably the
        # (unseen) add_rule call restricts the allowed choices — TODO confirm.
        def format_type(img_type):
            return type_code[img_type]
github hellock / icrawler / icrawler / builtin / bing.py View on Github external
# -*- coding: utf-8 -*-

import re

import six
from bs4 import BeautifulSoup
from six.moves import html_parser

from icrawler import Crawler, Parser, Feeder, ImageDownloader
from icrawler.builtin.filter import Filter


class BingFeeder(Feeder):
    # Feeder for Bing image search. Only a fragment of this class is
    # visible in this excerpt: ``get_filter`` is truncated after the
    # type rule (the color filter and the return are not shown).

    def get_filter(self):
        """Build the search Filter used to translate user options into
        Bing ``filterui`` query fragments (truncated in this excerpt)."""
        search_filter = Filter()

        # type filter
        # Bing calls animated GIFs 'animatedgif'; every other supported
        # type name maps straight onto the filterui token.
        def format_type(img_type):
            prefix = '+filterui:photo-'
            return (prefix + 'animatedgif'
                    if img_type == 'animated' else prefix + img_type)

        type_choices = [
            'photo', 'clipart', 'linedrawing', 'transparent', 'animated'
        ]
        search_filter.add_rule('type', format_type, type_choices)

        # color filter
github hellock / icrawler / icrawler / builtin / google.py View on Github external
# -*- coding: utf-8 -*-

import json

from bs4 import BeautifulSoup
from six.moves.urllib.parse import urlencode

from icrawler import Crawler, Feeder, Parser, ImageDownloader


class GoogleFeeder(Feeder):
    """Feeds Google image-search result-page URLs into the crawler queue."""

    def feed(self, keyword, offset, max_num, date_min, date_max):
        """Put one search URL per result page onto ``self.out_queue``.

        Args:
            keyword: the query string.
            offset: absolute index of the first result to request.
            max_num: how many results to cover (Google serves 100 per page).
            date_min: lower date bound (object with ``strftime``) or falsy
                for no lower bound.
            date_max: upper date bound, same convention as ``date_min``.
        """
        base_url = 'https://www.google.com/search?site=imghp&tbm=isch&source=hp&'
        # The date-range restriction (tbs) is identical for every page, so
        # build it once up front.
        lower = date_min.strftime('%d/%m/%Y') if date_min else ''
        upper = date_max.strftime('%d/%m/%Y') if date_max else ''
        date_restrict = 'cdr:1,cd_min:{},cd_max:{}'.format(lower, upper)
        for page_start in range(offset, offset + max_num, 100):
            # ``ijn`` is the zero-based page index; ``start`` the absolute
            # result offset within the whole result set.
            query = dict(
                q=keyword,
                ijn=int(page_start / 100),
                start=page_start,
                tbs=date_restrict,
                tbm='isch')
            url = base_url + urlencode(query)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))


class GoogleParser(Parser):
github hellock / icrawler / icrawler / crawler.py View on Github external
# Fragment of ``Crawler.__init__`` — the enclosing class and the rest of
# the method body (and docstring) lie outside this excerpt.
def __init__(self,
                 feeder_cls=Feeder,          # component classes are injected so
                 parser_cls=Parser,          # built-in crawlers can substitute
                 downloader_cls=Downloader,  # their own implementations
                 feeder_threads=1,
                 parser_threads=1,
                 downloader_threads=1,
                 # NOTE(review): mutable default argument — this dict is
                 # created once and shared by every call that omits
                 # ``storage``; safe only if it is never mutated. TODO confirm.
                 storage={
                     'backend': 'FileSystem',
                     'root_dir': 'images'
                 },
                 log_level=logging.INFO,
                 extra_feeder_args=None,
                 extra_parser_args=None,
                 extra_downloader_args=None):
        """Init components with class names and other arguments.

        Args:
github hellock / icrawler / icrawler / builtin / greedy.py View on Github external
# -*- coding: utf-8 -*-

import re
import time

from bs4 import BeautifulSoup
from six.moves.urllib.parse import urljoin, urlsplit

from icrawler import Crawler, Feeder, Parser, ImageDownloader


class GreedyFeeder(Feeder):
    """Seeds the greedy crawler with its start domains."""

    def feed(self, domains):
        """Emit every start domain, then idle until crawling finishes.

        The trailing loop keeps this feeder thread alive — polling the
        shared signal once per second — until another component sets
        ``reach_max_num``.
        """
        for start_domain in domains:
            self.output(start_domain)
        # Block here so the feeder does not exit before the crawl is done.
        while True:
            if self.signal.get('reach_max_num'):
                break
            time.sleep(1)


class GreedyParser(Parser):
    # Parser for the greedy crawler; this excerpt shows only ``__init__``
    # and the signature of ``is_in_domain`` (its body is truncated).

    def __init__(self, *args, **kwargs):
        # Pre-compile the image-URL pattern once. NOTE(review): ``\/`` is a
        # redundant escape of ``/`` in a Python regex, and the greedy
        # ``(.*)`` will match as much as possible before the extension.
        # The attribute is set BEFORE super().__init__ — presumably because
        # the base class may start worker threads that read it; do not
        # reorder without confirming.
        self.pattern = re.compile(
            r'(http|\/\/)(.*)\.(jpg|jpeg|png|bmp|gif|tiff)')
        super(GreedyParser, self).__init__(*args, **kwargs)

    # Body truncated in this excerpt.
    def is_in_domain(self, url, domains):
github hellock / icrawler / icrawler / builtin / google.py View on Github external
# -*- coding: utf-8 -*-

import datetime
import json

from bs4 import BeautifulSoup
from six.moves.urllib.parse import urlencode

from icrawler import Crawler, Feeder, Parser, ImageDownloader
from icrawler.builtin.filter import Filter


class GoogleFeeder(Feeder):
    # Feeder for Google image search. Only a fragment of this class is
    # visible in this excerpt: ``get_filter`` is truncated inside
    # ``format_color`` (mid dictionary literal).

    def get_filter(self):
        """Build the search Filter that maps user options onto Google
        ``tbs`` tokens (truncated in this excerpt)."""
        search_filter = Filter()

        # type filter
        # Google's token for line drawings is 'itp:lineart'; all other
        # supported types use 'itp:' + the type name directly.
        def format_type(img_type):
            return ('itp:lineart'
                    if img_type == 'linedrawing' else 'itp:' + img_type)

        type_choices = ['photo', 'face', 'clipart', 'linedrawing', 'animated']
        search_filter.add_rule('type', format_type, type_choices)

        # color filter
        # Truncated below: the mapping for these three special color modes
        # is cut off at the opening brace.
        def format_color(color):
            if color in ['color', 'blackandwhite', 'transparent']:
                code = {
github hellock / icrawler / icrawler / builtin / flickr.py View on Github external
# -*- coding: utf-8 -*-

import datetime
import json
import math
import os

from six.moves.urllib.parse import urlencode

from icrawler import Crawler, Feeder, Parser, ImageDownloader


class FlickrFeeder(Feeder):
    # Feeder for the Flickr REST API. Only a fragment of this class is
    # visible in this excerpt: ``feed`` is truncated inside the list of
    # accepted keyword-argument names.

    def feed(self, apikey, max_num=4000, **kwargs):
        """Build flickr.photos.search request URLs (truncated in this
        excerpt).

        Args:
            apikey: Flickr API key sent with every request.
            max_num: requested number of results; values above 4000 are
                clamped with a warning (the cap presumably mirrors Flickr's
                own search limit — TODO confirm against the API docs).
            **kwargs: extra search parameters; only whitelisted keys are
                forwarded (list truncated below).
        """
        if max_num > 4000:
            max_num = 4000
            self.logger.warning(
                'max_num exceeds 4000, set it to 4000 automatically.')
        base_url = 'https://api.flickr.com/services/rest/?'
        # Base parameters common to every request; 'nojsoncallback' asks
        # for raw JSON instead of a JSONP-wrapped response.
        params = {
            'method': 'flickr.photos.search',
            'api_key': apikey,
            'format': 'json',
            'nojsoncallback': 1
        }
        # Forward only recognized search options (whitelist truncated here).
        for key in kwargs:
            if key in ['user_id', 'tags', 'tag_mode', 'text', 'license',
                       'sort', 'privacy_filter', 'accuracy', 'safe_search',