How to use the icrawler.Crawler class in icrawler

To help you get started, we've selected a few icrawler examples based on popular ways it is used in public projects. Each example below subclasses icrawler.Crawler to build a ready-to-use image crawler.

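Every built-in crawler below follows the same pattern: subclass Crawler, pass a feeder class, a parser class, and a downloader class to the base constructor, and expose a crawl() method that packs per-component keyword arguments. The sketch below illustrates that composition; the MyFeeder, MyParser, and MyImageCrawler names are placeholders, and the feeder_kwargs/downloader_kwargs forwarding mirrors the UrlListCrawler example at the bottom of this page rather than being the library's only supported form.

from icrawler import Crawler, Feeder, Parser, ImageDownloader


class MyFeeder(Feeder):

    def feed(self, url_list):
        # push page urls onto the queue the parser threads read from
        for url in url_list:
            self.out_queue.put(url)


class MyParser(Parser):

    def parse(self, response):
        # yield one task dict per image found on the page
        yield dict(file_url='http://example.com/example.jpg')


class MyImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=MyFeeder,
                 parser_cls=MyParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(MyImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self, url_list, max_num=100, file_idx_offset=0):
        super(MyImageCrawler, self).crawl(
            feeder_kwargs=dict(url_list=url_list),
            downloader_kwargs=dict(
                max_num=max_num, file_idx_offset=file_idx_offset))


crawler = MyImageCrawler(storage={'root_dir': 'images/custom'})
crawler.crawl(url_list=['http://example.com/page.html'], max_num=10)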

hellock/icrawler: icrawler/builtin/greedy.py (view on GitHub)
                if len(tmp) > 1 and tmp[-1] not in [
                        'html', 'shtml', 'shtm', 'php', 'jsp', 'asp'
                ]:
                    continue
                # discard urls such as 'javascript:void(0)'
                elif href.find('javascript', 0, 10) == 0:
                    continue
                # discard urls such as 'android-app://xxxxxxxxx'
                elif urlsplit(href).scheme not in ['http', 'https', 'ftp']:
                    continue
                # urls of the same domain
                elif self.is_in_domain(href, domains):
                    yield href


class GreedyImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GreedyFeeder,
                 parser_cls=GreedyParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GreedyImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              domains,
              max_num=0,
              min_size=None,
              max_size=None,
              file_idx_offset=0):
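
A typical GreedyImageCrawler run only needs a start domain. The usage sketch below is an illustration rather than part of the source above; the storage directory, domain, and size bound are placeholder values.

from icrawler.builtin import GreedyImageCrawler

greedy_crawler = GreedyImageCrawler(storage={'root_dir': 'images/greedy'})
greedy_crawler.crawl(domains='http://example.com', max_num=20,
                     min_size=(100, 100))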
hellock/icrawler: icrawler/builtin/bing.py (view on GitHub)
    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                name = (match.group(1)
                        if six.PY3 else match.group(1).encode('utf-8'))
                img_url = '{}.jpg'.format(name)
                yield dict(file_url=img_url)


class BingImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=BingFeeder,
                 parser_cls=BingParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BingImageCrawler, self).__init__(feeder_cls, parser_cls,
                                               downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
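
Constructing and running BingImageCrawler then looks roughly like the following; the thread count and storage directory are illustrative choices, not requirements.

from icrawler.builtin import BingImageCrawler

bing_crawler = BingImageCrawler(downloader_threads=4,
                                storage={'root_dir': 'images/bing'})
bing_crawler.crawl(keyword='cat', filters=None, offset=0, max_num=100)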
hellock/icrawler: icrawler/builtin/google.py (view on GitHub)
            self.logger.debug('put url to url_queue: {}'.format(url))


class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(
            response.content.decode('utf-8', 'ignore'), 'lxml')
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])


class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GoogleImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
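
Usage is analogous to the Bing crawler. The filters dict below uses keys commonly documented for GoogleImageCrawler (for example size, color, type, license, date); treat the exact keys and values as assumptions to check against the feeder version you are running.

from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,
                                    storage={'root_dir': 'images/google'})
google_crawler.crawl(keyword='cat', filters=dict(size='large'), max_num=100)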
hellock/icrawler: icrawler/builtin/baidu.py (view on GitHub)
            content = response.content.decode('utf-8', 'ignore')
            content = json.loads(content, strict=False)
        except:
            self.logger.error('Fail to parse the response in json format')
            return
        for item in content['data']:
            if 'objURL' in item:
                img_url = self._decode_url(item['objURL'])
            elif 'hoverURL' in item:
                img_url = item['hoverURL']
            else:
                continue
            yield dict(file_url=img_url)


class BaiduImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=BaiduFeeder,
                 parser_cls=BaiduParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BaiduImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              filters=None,
              offset=0,
              max_num=1000,
              min_size=None,
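
A matching usage sketch for BaiduImageCrawler, where min_size is assumed to be a (width, height) lower bound checked by the image downloader; the directory name is a placeholder.

from icrawler.builtin import BaiduImageCrawler

baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'images/baidu'})
baidu_crawler.crawl(keyword='cat', offset=0, max_num=100,
                    min_size=(200, 200), max_size=None)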
hellock/icrawler: icrawler/builtin/flickr.py (view on GitHub)
                continue
            else:
                if info['stat'] == 'ok':
                    urls = {
                        item['label'].lower(): item['source']
                        for item in info['sizes']['size']
                    }
                else:
                    continue
                for sz in size_preference:
                    if sz in urls:
                        yield dict(file_url=urls[sz], meta=photo)
                        break


class FlickrImageCrawler(Crawler):

    def __init__(self,
                 apikey=None,
                 feeder_cls=FlickrFeeder,
                 parser_cls=FlickrParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        if apikey is None:
            apikey = os.getenv('FLICKR_APIKEY')
            if not apikey:
                raise RuntimeError('apikey is not specified')
        self.apikey = apikey
        super(FlickrImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)
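
As the constructor shows, FlickrImageCrawler needs an API key, passed explicitly or read from the FLICKR_APIKEY environment variable. In the sketch below, the keyword arguments beyond max_num are forwarded to the Flickr search API, so the parameter names and values are only illustrative.

from datetime import date
from icrawler.builtin import FlickrImageCrawler

flickr_crawler = FlickrImageCrawler('your-flickr-apikey',
                                    storage={'root_dir': 'images/flickr'})
flickr_crawler.crawl(max_num=100, tags='child,baby',
                     min_upload_date=date(2019, 5, 1))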
hellock/icrawler: icrawler/builtin/google.py (view on GitHub)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))


class GoogleParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='rg_meta')
        for div in image_divs:
            meta = json.loads(div.text)
            if 'ou' in meta:
                yield dict(file_url=meta['ou'])


class GoogleImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=GoogleFeeder,
                 parser_cls=GoogleParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(GoogleImageCrawler, self).__init__(
            feeder_cls, parser_cls, downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              offset=0,
              max_num=1000,
              date_min=None,
              date_max=None,
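
This older version of GoogleImageCrawler exposes date_min and date_max directly on crawl(); assuming they accept datetime.date objects, as in older icrawler releases, a call would look like:

from datetime import date
from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'images/google'})
google_crawler.crawl(keyword='sunny', max_num=100,
                     date_min=date(2017, 1, 1), date_max=date(2017, 6, 30))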
hellock/icrawler: icrawler/builtin/bing.py (view on GitHub)
class BingParser(Parser):

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        image_divs = soup.find_all('div', class_='imgpt')
        pattern = re.compile(r'murl\":\"(.*?)\.jpg')
        for div in image_divs:
            href_str = html_parser.HTMLParser().unescape(div.a['m'])
            match = pattern.search(href_str)
            if match:
                img_url = '{}.jpg'.format(match.group(1))
                yield dict(file_url=img_url)


class BingImageCrawler(Crawler):

    def __init__(self,
                 feeder_cls=SimpleSEFeeder,
                 parser_cls=BingParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(BingImageCrawler, self).__init__(feeder_cls, parser_cls,
                                               downloader_cls, *args, **kwargs)

    def crawl(self,
              keyword,
              offset=0,
              max_num=1000,
              min_size=None,
              max_size=None,
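
Note that this variant wires SimpleSEFeeder into BingImageCrawler instead of the dedicated BingFeeder; any component can be swapped the same way by passing a different class to the constructor. Below is a hedged sketch of that idea, where QuietBingParser is a hypothetical subclass added purely for illustration.

from icrawler.builtin import BingImageCrawler
from icrawler.builtin.bing import BingParser


class QuietBingParser(BingParser):
    # hypothetical parser that logs how many image urls each page yields
    def parse(self, response):
        tasks = list(super(QuietBingParser, self).parse(response))
        self.logger.info('parsed %d image urls', len(tasks))
        for task in tasks:
            yield task


crawler = BingImageCrawler(parser_cls=QuietBingParser,
                           storage={'root_dir': 'images/bing'})
crawler.crawl(keyword='cat', max_num=50)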
hellock/icrawler: icrawler/builtin/urllist.py (view on GitHub)
                                     threading.current_thread().name)
                    break
                else:
                    self.logger.info('%s is waiting for new page urls',
                                     threading.current_thread().name)
                    continue
            except Exception as e:
                self.logger.error('exception caught in thread %s: %s',
                                  threading.current_thread().name, e)
                continue
            else:
                self.logger.debug('start downloading page {}'.format(url))
            self.output({'file_url': url})


class UrlListCrawler(Crawler):

    def __init__(self,
                 feeder_cls=UrlListFeeder,
                 parser_cls=PseudoParser,
                 downloader_cls=ImageDownloader,
                 *args,
                 **kwargs):
        super(UrlListCrawler, self).__init__(feeder_cls, parser_cls,
                                             downloader_cls, *args, **kwargs)

    def crawl(self, url_list, max_num=1000, file_idx_offset=0,
              overwrite=False):
        feeder_kwargs = dict(url_list=url_list)
        downloader_kwargs = dict(
            file_idx_offset=file_idx_offset,
            max_num=max_num,
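
To finish the pattern, UrlListCrawler is typically driven with a plain text file of image URLs (one per line) or an in-memory list, per UrlListFeeder; the filename and thread count below are placeholders.

from icrawler.builtin import UrlListCrawler

urllist_crawler = UrlListCrawler(downloader_threads=4,
                                 storage={'root_dir': 'images/urllist'})
urllist_crawler.crawl('image_urls.txt', max_num=100)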