How to use the icrawler.feeder.Feeder class in icrawler

To help you get started, we’ve selected a few icrawler examples based on popular ways it is used in public projects.
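
At a high level, a Feeder is the component that produces page URLs and puts them on a queue for the parser threads to consume. The usual way to use it is to subclass Feeder and override feed(), as the examples below show. Here is a minimal sketch of that pattern (MyFeeder and its url_template/pages parameters are illustrative names, not part of icrawler):

from icrawler import Feeder

class MyFeeder(Feeder):
    """Minimal custom feeder: override feed() and queue URLs."""

    def feed(self, url_template, pages):
        # Each URL put on out_queue is later fetched and parsed by the
        # parser threads.
        for page in range(pages):
            url = url_template.format(page)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))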


From hellock/icrawler on GitHub, icrawler/feeder.py:
"""Feed urls.

        This method should be implemented by users.
        """
        raise NotImplementedError

    def worker_exec(self, **kwargs):
        """Target function of workers"""
        self.feed(**kwargs)
        self.logger.info('thread {} exit'.format(current_thread().name))

    def __exit__(self):
        self.logger.info('all feeder threads exited')


class UrlListFeeder(Feeder):
    """Url list feeder which feed a list of urls"""

    def feed(self, url_list, offset=0, max_num=0):
        if isinstance(url_list, str):
            if osp.isfile(url_list):
                with open(url_list, 'r') as fin:
                    url_list = [line.rstrip('\n') for line in fin]
            else:
                raise IOError('url list file {} not found'.format(url_list))
        elif not isinstance(url_list, list):
            raise TypeError('"url_list" can only be a filename or a str list')

        if offset < 0 or offset >= len(url_list):
            raise ValueError('"offset" exceeds the list length')
        else:
            if max_num > 0:
                end_idx = min(len(url_list), offset + max_num)
            else:
                end_idx = len(url_list)
            for i in range(offset, end_idx):
                url = url_list[i]
                self.out_queue.put(url)
                self.logger.debug('put url to url_queue: {}'.format(url))
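
In practice you rarely construct UrlListFeeder yourself: a Crawler builds the feeder with shared signal and session objects and forwards the keyword arguments of crawl() to feed(). Below is a rough wiring sketch, assuming the Crawler(feeder_cls=...) constructor and crawl(feeder_kwargs=...) interface described in the icrawler documentation; 'page_urls.txt' is a placeholder file with one URL per line.

from icrawler import Crawler, ImageDownloader
from icrawler.feeder import UrlListFeeder

# A real crawler also needs a Parser subclass that implements parse() for the
# target pages; it is omitted here to keep the focus on the feeder wiring.
crawler = Crawler(
    feeder_cls=UrlListFeeder,
    downloader_cls=ImageDownloader,
    downloader_threads=4,
    storage={'root_dir': 'images'})

# Everything in feeder_kwargs is forwarded to UrlListFeeder.feed().
crawler.crawl(feeder_kwargs=dict(url_list='page_urls.txt', offset=0, max_num=50))
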
From hellock/icrawler on GitHub, icrawler/feeder.py:
    def __init__(self, thread_num, signal, session):
        """Init Feeder with some shared variables."""
        super(Feeder, self).__init__(
            thread_num=thread_num, in_queue=None, name='feeder')
        self.signal = signal
        self.session = session
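
Since the base __init__ already registers the thread pool under the name 'feeder' and stores the shared signal and session, a subclass normally overrides only feed(); if it needs extra shared state, it extends __init__ and delegates to super(). The example below is purely hypothetical (AuthenticatedFeeder, api_token, and the endpoint parameter are made-up names used for illustration):

class AuthenticatedFeeder(Feeder):
    """Hypothetical feeder that carries an API token next to signal/session."""

    def __init__(self, thread_num, signal, session, api_token=''):
        super(AuthenticatedFeeder, self).__init__(thread_num, signal, session)
        self.api_token = api_token  # extra state available inside feed()

    def feed(self, endpoint, max_num=100, page_size=20):
        # Queue paginated, token-authenticated search URLs (illustration only).
        for start in range(0, max_num, page_size):
            url = '{}?start={}&token={}'.format(endpoint, start, self.api_token)
            self.out_queue.put(url)
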
From hellock/icrawler on GitHub, icrawler/feeder.py:
class SimpleSEFeeder(Feeder):
    """Simple search engine like Feeder"""

    def feed(self, url_template, keyword, offset, max_num, page_step):
        """Feed urls once

        Args:
            url_template: A string with parameters replaced with "{}".
            keyword: A string indicating the searching keyword.
            offset: An integer indicating the starting index.
            max_num: An integer indicating the max number of images to be crawled.
            page_step: An integer added to offset after each iteration.
        """
        for i in range(offset, offset + max_num, page_step):
            url = url_template.format(keyword, i)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
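
To see what SimpleSEFeeder actually puts on the queue, the standalone loop below mirrors its feed() body; the template URL and keyword are placeholders:

url_template = 'https://www.example.com/search?q={}&start={}'  # placeholder
keyword, offset, max_num, page_step = 'cat', 0, 60, 20

for i in range(offset, offset + max_num, page_step):
    print(url_template.format(keyword, i))
# https://www.example.com/search?q=cat&start=0
# https://www.example.com/search?q=cat&start=20
# https://www.example.com/search?q=cat&start=40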