How to use the Apify.pushData function in the apify package

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github drobnikj / crawler-google-places / src / places_crawler.js View on Github external
// NOTE(review): fragment of a Puppeteer page handler — `page`, `label`, `request`,
// `requestQueue`, `searchString`, `log`, and the include* flags come from the
// enclosing scope, which is not visible here.
try {
                // Check if Google shows captcha
                if (await page.$('form#captcha-form')) {
                    console.log('******\nGoogle shows captcha. This browser will be retired.\n******');
                    // Throwing hands control to the catch block below, which retires the browser.
                    throw new Error('Needs to fill captcha!');
                }
                if (label === 'startUrl') {
                    // Search-results page: enqueue every place-detail URL found for the search.
                    log.info(`Start enqueuing places details for search: ${searchString}`);
                    await enqueueAllPlaceDetails(page, searchString, requestQueue, maxCrawledPlaces, request);
                    log.info('Enqueuing places finished.');
                } else {
                    // Get data for place and save it to dataset
                    log.info(`Extracting details from place url ${page.url()}`);
                    const placeDetail = await extractPlaceDetail(page, request, searchString, includeReviews, includeImages,
                        includeHistogram, includeOpeningHours, includePeopleAlsoSearch, launchPuppeteerOptions.proxyConfig);
                    await Apify.pushData(placeDetail);
                    log.info(`Finished place url ${placeDetail.url}`);
                }
            } catch(err) {
                // This issue can happen, mostly because proxy IP was blocked by google
                // Let's refresh IP using browser refresh.
                // In debug mode, persist the page HTML and a screenshot for later diagnosis.
                if (log.getLevel() === log.LEVELS.DEBUG) {
                    await saveHTML(page, `${request.id}.html`);
                    await saveScreenshot(page, `${request.id}.png`);
                }
                // Retire the whole browser so a retry gets a fresh one (and presumably a fresh proxy IP).
                await puppeteerPool.retire(page.browser());
                if (request.retryCount < MAX_PAGE_RETRIES && log.getLevel() !== log.LEVELS.DEBUG) {
                    // This suppresses the stack trace in the log for requests that will be retried;
                    // ideally this would be handled in the SDK instead.
                    err.stack = 'Stack trace was omitted for retires requests. Set up debug mode to see it.';
                }
                // Re-throw so the crawler's retry machinery sees the failure.
                throw err;
            }
github apifytech / apify-cli / src / templates / puppeteer_crawler / main.js View on Github external
// NOTE(review): fragment of the Puppeteer crawler template — `$posts`, `data`,
// `pageFunction`, `page`, `request`, and `requestQueue` are declared in the
// surrounding scope that is not visible here.
// We're getting the title, rank and URL of each post on Hacker News.
                $posts.forEach(($post) => {
                    data.push({
                        title: $post.querySelector('.title a').innerText,
                        rank: $post.querySelector('.rank').innerText,
                        href: $post.querySelector('.title a').href,
                    });
                });

                return data;
            };
            // Evaluate pageFunction in the browser context against all '.athing' rows.
            const data = await page.$$eval('.athing', pageFunction);

            // Store the results to the default dataset.
            await Apify.pushData(data);

            // Find a link to the next page and enqueue it if it exists.
            const infos = await Apify.utils.enqueueLinks({
                page,
                requestQueue,
                selector: '.morelink',
            });

            // enqueueLinks returns one info per matched link; none means no "More" link.
            if (infos.length === 0) console.log(`${request.url} is the last page!`);
        },
github cermak-petr / actor-booking-scraper / src / main.js View on Github external
// NOTE(review): fragment of the Booking.com list-page handler; it is cut off
// mid-statement at the end. `page`, `input`, `state`, `migrating`,
// `enqueuingReady`, `listPageFunction`, and the helper functions come from the
// enclosing scope (not visible here).
console.log('extracting data...');
                    // Save the raw HTML to the key-value store for debugging.
                    await Apify.setValue('page.html', await page.content(), {contentType: 'text/html'});
                    await Apify.utils.puppeteer.injectJQuery(page);
                    const result = await page.evaluate(listPageFunction, input);
                    console.log('Found ' + result.length + ' results');
                    if (result.length > 0) {
                        // Deduplicate by hotel name using the shared `state.crawled` map.
                        const toBeAdded = [];
                        for (const item of result) {
                            item.url = addUrlParameters(item.url, input);
                            if (!state.crawled[item.name]) {
                                toBeAdded.push(item);
                                state.crawled[item.name] = true;
                            }
                        }
                        // Persist dedupe state if the actor is being migrated to another worker.
                        if (migrating) { await Apify.setValue('STATE', state); }
                        if (toBeAdded.length > 0) { await Apify.pushData(toBeAdded); }
                    }
                } else if (enqueuingReady) { // If not, enqueue the detail pages to be extracted.
                    console.log('enqueuing detail pages...');
                    //await enqueueLinks(page, requestQueue, '.hotel_name_link', null, 'detail',
                    //    fixUrl('&', input), (link) => getAttribute(link, 'textContent'));
                    const urlMod = fixUrl('&', input);
                    const keyMod = (link) => getAttribute(link, 'textContent');
                    // Read the "showing N - M of X" pagination text to find the first item index.
                    const prItem = await page.$('.bui-pagination__info');
                    const pageRange = (await getAttribute(prItem, 'textContent')).match(/\d+/g);
                    const firstItem = parseInt(pageRange[0]);
                    const links = await page.$$('.hotel_name_link');
                    for (let iLink = 0; iLink < links.length; iLink++) {
                        const link = links[iLink];
                        const href = await getAttribute(link, 'href');
                        if (href) {
                            await requestQueue.addRequest(new Apify.Request({
github apifytech / apify-js / examples / forms.js View on Github external
// NOTE(review): tail of an Apify.main() example — `page` is a Puppeteer Page
// created earlier in code that is not visible here.
// Submit the form and wait for full load of next page
    console.log('Submit search form');
    // Start waiting for navigation BEFORE clicking, so the navigation event
    // triggered by the click cannot be missed.
    await Promise.all([
        page.waitForNavigation(),
        page.click('#adv_code_search button[type="submit"]')
    ]);

    // Obtain and print list of search results
    const results = await page.$$eval('div.codesearch-results ul.repo-list li h3 a', nodes => nodes.map(node => ({
        url: node.href,
        name: node.innerText
    })));
    console.log('Results:', results);

    // Store data in default dataset
    await Apify.pushData(results);
});
github cermak-petr / actor-booking-scraper / src / main.js View on Github external
// NOTE(review): fragment of the Booking.com page handler; cut off at both ends.
// `page`, `input`, `ld`, `request`, `sortBy`, and the helpers come from the
// enclosing scope (not visible here).
const pageUrl = await page.url();
                // If the URL lost its 'label' parameter, the proxy was presumably
                // blocked/redirected — retire the browser and let the request retry.
                if (!input.startUrls && pageUrl.indexOf('label') < 0) {
                    await retireBrowser();
                    return;
                }

                // Exit if core data is not present or the rating is too low.
                if (!ld || (ld.aggregateRating && ld.aggregateRating.ratingValue <= (input.minScore || 0))) {
                    return;
                }
                
                // Extract the data.
                console.log('extracting detail...');
                const detail = await extractDetail(page, ld, input, request.userData);
                console.log('detail extracted');
                await Apify.pushData(detail);
                return;
            } else { // Handle hotel list page.
                
                // Enqueuing is ready only once all requested filters/prices/types are applied.
                const filtered = await isFiltered(page);
                const settingFilters = input.useFilters && !filtered;
                const settingMinMaxPrice = input.minMaxPrice != 'none' && !await isMinMaxPriceSet(page, input);
                const settingPropertyType = input.propertyType != 'none' && !await isPropertyTypeSet(page, input);
                const enqueuingReady = !(settingFilters || settingMinMaxPrice || settingPropertyType);
                
                // Check if the page was open through working proxy.
                const pageUrl = await page.url();
                if (!input.startUrls && pageUrl.indexOf(sortBy) < 0) {
                    await retireBrowser();
                    return;
                }
// NOTE(review): tail of a result-saving helper from the Amazon crawler; it
// starts with an orphan `} else {` — the matching `if` and the `type`,
// `input`, `item`, `datasetId`, `request` bindings are not visible here.
} else {
            // No sellers found: record a status row so the keyword is not silently lost.
            await Apify.pushData({
                status: 'No sellers for this keyword.',
                keyword: request.userData.keyword,
            });
        }
    } else if (type === 'RESULT') {
        if (input.maxResults) {
            // Only push while the dataset is still under the configured cap.
            if (await checkSaveCount(datasetId, input.maxResults) === true) {
                await Apify.pushData(item);
            } else {
                // Cap reached — terminate the whole actor process immediately.
                console.log('Finished');
                process.exit(0);
            }
        } else {
            await Apify.pushData(item);
        }
    }
}
github apifytech / actor-scraper / web-scraper / src / crawler_setup.js View on Github external
async _handleResult(request, response, pageFunctionResult, isError) {
        const start = process.hrtime();
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
        tools.logPerformance(request, 'handleResult EXECUTION', start);
    }
github apifytech / actor-scraper / cheerio-scraper / src / crawler_setup.js View on Github external
async _handleResult(request, response, pageFunctionResult, isError) {
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
    }
}

apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

Apache-2.0
Latest version published 25 days ago

Package Health Score

84 / 100
Full package analysis