How to use the apify.RequestList function in apify

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.
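In short, a RequestList is a static list of URLs to crawl: you construct it from an array of sources, initialize it, and then a crawler (or your own loop) fetches requests from it one by one. A minimal sketch of that lifecycle, independent of any crawler (the URL and persistence key below are illustrative):

const Apify = require('apify');

Apify.main(async () => {
    // A RequestList is built from static sources and must be initialized
    // before any requests can be fetched from it.
    const requestList = new Apify.RequestList({
        sources: [{ url: 'http://www.example.com/' }],
        // Persisting state lets an interrupted actor run resume where it left off.
        persistStateKey: 'my-list-state',
    });
    await requestList.initialize();

    // Crawlers run this loop internally; it is shown here for illustration.
    let request;
    while ((request = await requestList.fetchNextRequest()) !== null) {
        console.log(`Fetched ${request.url}`);
        await requestList.markRequestHandled(request);
    }
});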

github cermak-petr / actor-booking-scraper / src / main.js
// check if attribute is an Array
        if (!Array.isArray(input.startUrls)) {
            throw new Error('INPUT.startUrls must be an array!');
        }
        // convert any inconsistencies to correct format
        for (let i = 0; i < input.startUrls.length; i++) {
            let request = input.startUrls[i];
            if (typeof request === 'string') { request = { url: request }; }
            if ((!request.userData || request.userData.label !== 'detail') && request.url.indexOf('/hotel/') > -1) {
                request.userData = { label: 'detail' };
            }
            request.url = addUrlParameters(request.url, input);
            input.startUrls[i] = request;
        }
        // create RequestList and reference startUrl
        requestList = new Apify.RequestList({ sources: input.startUrls });
        startUrl = addUrlParameters('https://www.booking.com/searchresults.html?dest_type=city;ss=paris&order=bayesian_review_score', input);
        await requestList.initialize();
    } else {
        // Create startURL based on provided INPUT.
        const dType = input.destType || 'city';
        const query = encodeURIComponent(input.search);
        startUrl = `https://www.booking.com/searchresults.html?dest_type=${dType};ss=${query}&order=${sortBy}`;
        startUrl = addUrlParameters(startUrl, input);

        // Enqueue all pagination pages.
        startUrl += '&rows=20';
        console.log(`startUrl: ${startUrl}`);
        await requestQueue.addRequest(new Apify.Request({ url: startUrl, userData: { label: 'start' } }));
        if (!input.useFilters && input.propertyType == 'none' && input.minMaxPrice == 'none' && input.maxPages) {
            for (let i = 1; i <= input.maxPages; i++) {
                await requestQueue.addRequest(new Apify.Request({
                await requestQueue.addRequest(new Apify.Request({
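                    // Illustrative completion - the original snippet is truncated here.
                    // The offset parameter and the 'page' label are assumptions
                    // about the actor's pagination scheme (20 rows per page, see above).
                    url: `${startUrl}&offset=${20 * i}`,
                    userData: { label: 'page' },
                }));
            }
        }
    }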
github apifytech / apify-js / examples / screenshots.js
Apify.main(async () => {
    // Read the actor input configuration containing the URLs for the screenshot.
    // By convention, the input is present in the actor's default key-value store under the "INPUT" key.
    const input = await Apify.getInput();
    if (!input) throw new Error('Have you passed the correct INPUT?');

    const { sources } = input;

    const requestList = new Apify.RequestList({ sources });
    await requestList.initialize();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            // This is a Puppeteer function that takes a screenshot of the page and returns its buffer.
            const screenshotBuffer = await page.screenshot();

            // The record key may only include the following characters: a-zA-Z0-9!-_.'()
            const key = request.url.replace(/[:/]/g, '_');

            // Save the screenshot. Choosing the right content type will automatically
            // assign the local file the right extension, in this case .png.
            // The screenshots will be stored in ./apify_storage/key_value_stores/default/
            await Apify.setValue(key, screenshotBuffer, { contentType: 'image/png' });
            console.log(`Screenshot of ${request.url} saved.`);
        },
    });

    await crawler.run();
});
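A stored screenshot can later be read back from the same key-value store with the complementary Apify.getValue() call, for example inside another Apify.main handler. A minimal sketch, with a hypothetical key matching the sanitization above:

// Read a previously saved screenshot back from the default key-value store.
// The key below is hypothetical - it must match the sanitized URL that was
// used as the key with Apify.setValue() above.
const buffer = await Apify.getValue('http___www_example_com_');
if (buffer) {
    console.log(`Loaded screenshot, ${buffer.length} bytes`);
}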
github apifytech / apify-js / examples / cheerio_crawler.js
Apify.main(async () => {
    // Create an instance of the RequestList class that contains a list of URLs to crawl.
    // Here we download and parse the list of URLs from an external file.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: CSV_LINK }],
    });
    await requestList.initialize();

    // Create an instance of the CheerioCrawler class - a crawler
    // that automatically loads the URLs and parses their HTML using the cheerio library.
    const crawler = new Apify.CheerioCrawler({
        // Let the crawler fetch URLs from our list.
        requestList,

        // The crawler downloads and processes the web pages in parallel, with a concurrency
        // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
        // Here we define some hard limits for the concurrency.
        minConcurrency: 10,
        maxConcurrency: 50,
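        // Illustrative completion - the original snippet is truncated here.
        // The handler receives the parsed page; $ is a cheerio handle to its HTML.
        handlePageFunction: async ({ request, $ }) => {
            await Apify.pushData({
                url: request.url,
                title: $('title').text(),
            });
        },
    });

    await crawler.run();
});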
github apifytech / apify-js / examples / example_basic_crawler.js
Apify.main(async () => {
    // Create a request list.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.example.com' },
            { url: 'http://www.example.com/?page=2' },
            { url: 'http://www.example.com/?page=3' },
            { url: 'http://www.example.com/?page=4' },
            { url: 'http://www.example.com/?page=5' },
        ],
    });

    await requestList.initialize();

    const crawler = new Apify.BasicCrawler({
        requestList,

        // This function is executed for each request.
        // If the request fails, it is retried up to 3 times.
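        // Illustrative completion - the original snippet is truncated here.
        // (The retry limit mentioned above is the maxRequestRetries option,
        // which defaults to 3.)
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);
        },
        handleFailedRequestFunction: async ({ request }) => {
            // Called only once all retries are exhausted.
            console.log(`Request ${request.url} failed too many times.`);
        },
    });

    await crawler.run();
});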
github apifytech / apify-js / examples / example_puppeteer_crawler.js
Apify.main(async () => {
    // Create a request list.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.example.com' },
            { url: 'http://www.some-nonexisting-domain.com' },
        ],
    });

    await requestList.initialize();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        disableProxy: true,

        // This function is executed for each request.
        // If the request fails, it is retried up to 3 times.
        // The page parameter is Puppeteer's Page object with the loaded page.
        handlePageFunction: async ({ page, request }) => {
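            // Illustrative completion - the original snippet is truncated here.
            // The nonexistent domain in the sources will fail and exercise
            // the retry behavior described above.
            const title = await page.title();
            console.log(`Title of ${request.url}: ${title}`);
        },
    });

    await crawler.run();
});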
github apifytech / apify-js / examples / basic_crawler.js
Apify.main(async () => {
    // Create and initialize an instance of the RequestList class that contains
    // a list of URLs to crawl. Here we use just a few hard-coded URLs.
    const requestList = new Apify.RequestList({
        sources: [
            { url: 'http://www.google.com/' },
            { url: 'http://www.example.com/' },
            { url: 'http://www.bing.com/' },
            { url: 'http://www.wikipedia.com/' },
        ],
    });
    await requestList.initialize();

    // Create a BasicCrawler - the simplest crawler that enables
    // users to implement the crawling logic themselves.
    const crawler = new Apify.BasicCrawler({

        // Let the crawler fetch URLs from our list.
        requestList,
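        // Illustrative completion - the original snippet is truncated here.
        // BasicCrawler leaves all fetching to the user, so the handler downloads
        // each page itself; requestAsBrowser is the SDK's built-in HTTP client.
        handleRequestFunction: async ({ request }) => {
            const { body } = await Apify.utils.requestAsBrowser({ url: request.url });
            await Apify.pushData({ url: request.url, html: body });
        },
    });

    await crawler.run();
});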
github apifytech / apify-js / examples / puppeteer_sitemap.js
Apify.main(async () => {
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: 'https://edition.cnn.com/sitemaps/cnn/news.xml' }],
    });
    await requestList.initialize();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            await Apify.pushData({
                url: request.url,
                title: await page.title(),
                html: await page.content(),
            });
        },
    });

    await crawler.run();
});

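Records stored with Apify.pushData() end up in the default dataset (locally under ./apify_storage/datasets/default). A minimal sketch of reading the results back after the run, using the SDK's dataset API:

// Open the default dataset and read everything that was pushed into it.
const dataset = await Apify.openDataset();
const { items } = await dataset.getData();
console.log(`Scraped ${items.length} pages from the sitemap.`);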
apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

License: Apache-2.0