How to use the apify.utils object in apify

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.
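The examples below show that Apify.utils is a namespace of helper utilities rather than a single function. As a quick orientation, here is a short sketch limited to the members that actually appear in the snippets on this page (the surrounding actor boilerplate is illustrative):

const Apify = require('apify');

// General-purpose helpers used throughout the examples below.
const { log, sleep, enqueueLinks, createRequestDebugInfo } = Apify.utils;

// Puppeteer-specific helpers live in their own sub-namespace.
const { injectJQuery, blockRequests } = Apify.utils.puppeteer;

Apify.main(async () => {
    // Reduce log verbosity and pause for half a second, just to exercise two of the helpers.
    log.setLevel(log.LEVELS.WARNING);
    await sleep(500);
});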

github apifytech / apify-js / examples / synchronous_run.js
const Apify = require('apify');

Apify.main(async () => {
    // Launch web browser.
    const browser = await Apify.launchPuppeteer();

    // Load http://goldengatebridge75.org/news/webcam.html and get an IFRAME with the webcam stream
    console.log('Opening web page...');
    const page = await browser.newPage();
    await page.goto('http://goldengatebridge75.org/news/webcam.html');
    const iframe = (await page.frames()).pop();

    // Get webcam image element handle.
    const imageElementHandle = await iframe.$('.VideoColm img');

    // Give the webcam image some time to load.
    console.log('Waiting for page to load...');
    await Apify.utils.sleep(3000);

    // Get a screenshot of that image.
    const imageBuffer = await imageElementHandle.screenshot();
    console.log('Screenshot captured.');

    // Save the screenshot as the actor's output. By convention, similarly to "INPUT",
    // the actor's output is stored in the default key-value store under the "OUTPUT" key.
    // Note: elementHandle.screenshot() returns a PNG buffer by default.
    await Apify.setValue('OUTPUT', imageBuffer, { contentType: 'image/png' });
    console.log('Actor finished.');
});
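Once the actor above finishes, the screenshot can be read back from the same default key-value store. A minimal sketch, assuming the record was stored under the "OUTPUT" key as in the example:

const Apify = require('apify');

Apify.main(async () => {
    // Read the previously stored screenshot buffer from the default key-value store.
    const imageBuffer = await Apify.getValue('OUTPUT');
    console.log(`Retrieved a screenshot of ${imageBuffer.length} bytes.`);
});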
github apifytech / actor-scraper / scraper-tools / src / tools.js
    // A null or undefined page function result does not prevent the request
    // from being saved to the dataset. It will just contain
    // the relevant metadata.
    let result = pageFunctionResult || {};

    // Validate the result.
    const type = typeof result;
    if (type !== 'object') {
        throw new Error(`Page function must return Object | Object[], but it returned ${type}.`);
    }

    // Metadata needs to be appended to each item
    // to match results with dataset "lines".
    if (!Array.isArray(result)) result = [result];
    const meta = {
        '#error': isError,
        '#debug': Apify.utils.createRequestDebugInfo(request, response),
    };

    return result.map(item => Object.assign({}, item, meta));
};
github apifytech / apify-js / examples / cheerio_crawler.js
/**
 * This example demonstrates how to use [`CheerioCrawler`](../api/cheeriocrawler)
 * to crawl a list of URLs from an external file,
 * load each URL using a plain HTTP request, parse the HTML using <a href="https://www.npmjs.com/package/cheerio" target="_blank">cheerio</a>
 * and extract some data from it: the page title and all H1 tags.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 on Alpine Linux (apify/actor-node-basic)` base image
 * on the source tab of your actor configuration.
 */

const Apify = require('apify');

// Apify.utils contains various utilities, e.g. for logging.
// Here we turn off the logging of unimportant messages.
const { log } = Apify.utils;
log.setLevel(log.LEVELS.WARNING);

// A link to a list of Fortune 500 companies' websites available on GitHub.
const CSV_LINK = 'https://gist.githubusercontent.com/hrbrmstr/ae574201af3de035c684/raw/f1000.csv';

// Apify.main() function wraps the crawler logic (it is optional).
Apify.main(async () => {
    // Create an instance of the RequestList class that contains a list of URLs to crawl.
    // Here we download and parse the list of URLs from an external file.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: CSV_LINK }],
    });
    await requestList.initialize();

    // Create an instance of the CheerioCrawler class - a crawler
    // that automatically loads the URLs and parses their HTML using the cheerio library.
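The excerpt stops just before the crawler itself is created. A sketch of how such an example typically continues (the page-function body here is illustrative, not copied from the original file):

    const crawler = new Apify.CheerioCrawler({
        requestList,
        handlePageFunction: async ({ request, $ }) => {
            // Extract the page title and all H1 texts with cheerio and store them to the dataset.
            await Apify.pushData({
                url: request.url,
                title: $('title').text(),
                h1texts: $('h1').map((i, el) => $(el).text()).get(),
            });
        },
    });

    await crawler.run();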
github drobnikj / crawler-google-places / src / places_crawler.js
const Apify = require('apify');
const Globalize = require('globalize');

const DEFAULT_CRAWLER_LOCALIZATION = ['en', 'cs'];

Globalize.load(require('cldr-data').entireSupplemental());
Globalize.load(require('cldr-data').entireMainFor(...DEFAULT_CRAWLER_LOCALIZATION));

const { sleep, log } = Apify.utils;
const { injectJQuery, blockRequests } = Apify.utils.puppeteer;
const infiniteScroll = require('./infinite_scroll');
const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, PLACE_TITLE_SEL } = require('./consts');
const { enqueueAllPlaceDetails } = require('./enqueue_places_crawler');
const { saveHTML, saveScreenshot, waitForGoogleMapLoader,
    parseReviewFromResponseBody, scrollTo } = require('./utils');

/**
 * This is the worst part - parsing data from place detail
 * @param page
 */
const extractPlaceDetail = async (page, request, searchString, includeReviews, includeImages, includeHistogram, includeOpeningHours, includePeopleAlsoSearch, proxyConfig) => {
    // Extract basic information
    await waitForGoogleMapLoader(page);
    await page.waitForSelector(PLACE_TITLE_SEL, { timeout: DEFAULT_TIMEOUT });
    const detail = await page.evaluate((placeTitleSel) => {
        return {
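The excerpt above imports injectJQuery and blockRequests from Apify.utils.puppeteer but is cut off before they are used. Purely to illustrate those two helpers (the URL patterns are placeholder values, not taken from this crawler):

const preparePage = async (page) => {
    // Abort requests for heavy static assets to speed up crawling.
    await blockRequests(page, { urlPatterns: ['.png', '.jpg', '.jpeg', '.svg', '.gif'] });
    // Make jQuery available to subsequent page.evaluate() calls.
    await injectJQuery(page);
};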
github apifytech / apify-cli / src / templates / cheerio_crawler / main.js
        handlePageFunction: async ({ request, page }) => {
            const title = await page.title();
            console.log(`Title of ${request.url}: ${title}`);
            await Apify.pushData({
                title,
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
            await Apify.utils.enqueueLinks({ page, selector: 'a', pseudoUrls, requestQueue });
        },
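The pseudoUrls and requestQueue referenced in this handler are created earlier in the template. Roughly, assuming a placeholder pseudo-URL pattern:

    const requestQueue = await Apify.openRequestQueue();
    const pseudoUrls = [new Apify.PseudoUrl('https://www.example.com/[.*]')];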
github apifytech / apify-js / examples / puppeteer_crawler.js
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
    });
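In the full example, this failure handler sits next to the page handler in the PuppeteerCrawler configuration. A self-contained sketch with illustrative values:

const Apify = require('apify');

Apify.main(async () => {
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: 'https://www.example.com' });

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        maxRequestRetries: 3, // illustrative value
        handlePageFunction: async ({ page, request }) => {
            console.log(`Title of ${request.url}: ${await page.title()}`);
        },
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
            await Apify.pushData({ '#debug': Apify.utils.createRequestDebugInfo(request) });
        },
    });

    await crawler.run();
});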
github apifytech / apify-cli / src / templates / cheerio_crawler / main.js
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
            await Apify.pushData({
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
github apifytech / actor-scraper / cheerio-scraper / src / crawler_setup.js
    async _handleLinks($, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        await Apify.utils.enqueueLinks({
            $,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            baseUrl: request.loadedUrl,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
github drobnikj / crawler-google-places / src / infinite_scroll.js
const Apify = require('apify');
const { scrollTo } = require('./utils');

const { sleep, log } = Apify.utils;

const logInfo = (msg) => log.info(msg);
const logDebug = (msg) => log.debug(msg);

/**
 * Method returns info about page scroll
 */
const getPageScrollInfo = (page, elementToScroll) => page.evaluate((elementToScroll) => {
    return {
        scrollHeight: document.querySelector(elementToScroll).scrollHeight,
        scrollTop: document.querySelector(elementToScroll).scrollTop,
        clientHeight: document.querySelector(elementToScroll).clientHeight,
    };
}, elementToScroll);
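getPageScrollInfo only reports measurements; here is a sketch of how such measurements can drive an infinite-scroll loop (an illustration, not the actual implementation from infinite_scroll.js):

const scrollUntilStable = async (page, elementToScroll = 'body', maxRounds = 10) => {
    let previousHeight = 0;
    for (let round = 0; round < maxRounds; round++) {
        const { scrollHeight } = await getPageScrollInfo(page, elementToScroll);
        if (scrollHeight === previousHeight) break; // No new content appeared, stop scrolling.
        previousHeight = scrollHeight;
        // Scroll the element to its bottom and give lazily loaded content time to arrive.
        await page.evaluate((selector) => {
            const el = document.querySelector(selector);
            el.scrollTop = el.scrollHeight;
        }, elementToScroll);
        await sleep(1000); // Apify.utils.sleep, imported above.
    }
};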

/**
github apifytech / apify-cli / src / templates / basic_crawler / main.js
        handleRequestFunction: async ({ request }) => {
            await Apify.pushData({
                request,
                finishedAt: new Date(),
                html: await rp(request.url),
                '#debug': Apify.utils.createRequestDebugInfo(request),
            });
        },
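The rp call in this handler is not defined in the excerpt; in the full template it is an HTTP client required at the top of main.js, presumably something like:

const rp = require('request-promise'); // promise-based HTTP client used to fetch request.url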

apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

Apache-2.0