How to use apify-client - 10 common examples

To help you get started, we’ve selected a few apify-client examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github apifytech / apify-js / src / crawlers / basic_crawler.js View on Github external
maxRequestRetries,
            maxRequestsPerCrawl,
            autoscaledPoolOptions,

            // AutoscaledPool shorthands
            minConcurrency,
            maxConcurrency,
        } = _.defaults({}, options, DEFAULT_OPTIONS);

        checkParamPrototypeOrThrow(requestList, 'options.requestList', RequestList, 'Apify.RequestList', true);
        checkParamPrototypeOrThrow(requestQueue, 'options.requestQueue', [RequestQueue, RequestQueueLocal], 'Apify.RequestQueue', true);
        checkParamOrThrow(handleRequestFunction, 'options.handleRequestFunction', 'Function');
        checkParamOrThrow(handleRequestTimeoutSecs, 'options.handleRequestTimeoutSecs', 'Number');
        checkParamOrThrow(handleFailedRequestFunction, 'options.handleFailedRequestFunction', 'Function');
        checkParamOrThrow(maxRequestRetries, 'options.maxRequestRetries', 'Number');
        checkParamOrThrow(maxRequestsPerCrawl, 'options.maxRequestsPerCrawl', 'Maybe Number');
        checkParamOrThrow(autoscaledPoolOptions, 'options.autoscaledPoolOptions', 'Object');

        if (!requestList && !requestQueue) {
            throw new Error('At least one of the parameters "options.requestList" and "options.requestQueue" must be provided!');
        }

        this.requestList = requestList;
        this.requestQueue = requestQueue;
        this.handleRequestFunction = handleRequestFunction;
        this.handleRequestTimeoutSecs = handleRequestTimeoutSecs;
        this.handleFailedRequestFunction = handleFailedRequestFunction;
        this.maxRequestRetries = maxRequestRetries;
        this.handledRequestsCount = 0;
        this.stats = new Statistics({ logMessage: 'Crawler request statistics:' });

        let shouldLogMaxPagesExceeded = true;
github apifytech / apify-js / src / enqueue_links / enqueue_links.js View on Github external
if (userData) {
        log.deprecated('options.userData of Apify.utils.enqueueLinks() is deprecated. Use options.transformRequestFunction instead.');
    }

    checkParamOrThrow(page, 'page', 'Maybe Object');
    checkParamOrThrow($, '$', 'Maybe Function');
    if (!page && !$) {
        throw new Error('One of the parameters "options.page" or "options.$" must be provided!');
    }
    if (page && $) {
        throw new Error('Only one of the parameters "options.page" or "options.$" must be provided!');
    }
    checkParamOrThrow(selector, 'selector', 'String');
    checkParamPrototypeOrThrow(requestQueue, 'requestQueue', [RequestQueue, RequestQueueLocal], 'Apify.RequestQueue');
    checkParamOrThrow(baseUrl, 'baseUrl', 'Maybe String');
    if (baseUrl && page) log.warning('The parameter options.baseUrl can only be used when parsing a Cheerio object. It will be ignored.');
    checkParamOrThrow(pseudoUrls, 'pseudoUrls', 'Maybe Array');
    checkParamOrThrow(userData, 'userData', 'Maybe Object');
    checkParamOrThrow(transformRequestFunction, 'transformRequestFunction', 'Maybe Function');

    // Construct pseudoUrls from input where necessary.
    const pseudoUrlInstances = constructPseudoUrlInstances(pseudoUrls || []);

    const urls = page ? await extractUrlsFromPage(page, selector) : extractUrlsFromCheerio($, selector, baseUrl);
    let requestOptions = createRequestOptions(urls, userData);
    if (transformRequestFunction) {
        requestOptions = requestOptions.map(transformRequestFunction).filter(r => !!r);
    }
    const requests = createRequests(requestOptions, pseudoUrlInstances);
    return addRequestsToQueueInBatches(requests, requestQueue);
}
github apifytech / apify-js / src / puppeteer_pool.js View on Github external
launchPuppeteerOptions,
            recycleDiskCache,
            useIncognitoPages,
            proxyUrls,
            useLiveView,
        } = _.defaults({}, options, DEFAULT_OPTIONS);

        // Disabling due to memory leak.
        const reusePages = false;

        checkParamOrThrow(reusePages, 'options.reusePages', 'Boolean');
        checkParamOrThrow(maxOpenPagesPerInstance, 'options.maxOpenPagesPerInstance', 'Number');
        checkParamOrThrow(retireInstanceAfterRequestCount, 'options.retireInstanceAfterRequestCount', 'Number');
        checkParamOrThrow(launchPuppeteerFunction, 'options.launchPuppeteerFunction', 'Function');
        checkParamOrThrow(puppeteerOperationTimeoutSecs, 'options.puppeteerOperationTimeoutSecs', 'Number');
        checkParamOrThrow(instanceKillerIntervalMillis, 'options.instanceKillerIntervalMillis', 'Maybe Number');
        if (instanceKillerIntervalMillis) {
            log.deprecated('PuppeteerPool: options.instanceKillerIntervalMillis is deprecated, use options.instanceKillerIntervalSecs instead.');
        }
        checkParamOrThrow(instanceKillerIntervalSecs, 'options.instanceKillerIntervalSecs', 'Number');
        checkParamOrThrow(killInstanceAfterMillis, 'options.killInstanceAfterMillis', 'Maybe Number');
        if (killInstanceAfterMillis) {
            log.deprecated('PuppeteerPool: options.killInstanceAfterMillis is deprecated, use options.killInstanceAfterSecs instead.');
        }
        checkParamOrThrow(killInstanceAfterSecs, 'options.killInstanceAfterSecs', 'Number');
        checkParamOrThrow(launchPuppeteerOptions, 'options.launchPuppeteerOptions', 'Maybe Object');
        checkParamOrThrow(recycleDiskCache, 'options.recycleDiskCache', 'Maybe Boolean');
        checkParamOrThrow(useIncognitoPages, 'options.useIncognitoPages', 'Maybe Boolean');
        checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe Array');
        // Enforce non-empty proxyUrls array
        if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
        checkParamOrThrow(useLiveView, 'options.useLiveView', 'Maybe Boolean');
github apifytech / apify-js / src / request_list.js View on Github external
/**
 * Creates a `RequestList` from the given sources, initializes it and returns it.
 *
 * String entries in `sources` are wrapped into `{ url }` objects; object entries are
 * passed through untouched. Every entry is normalized individually, so arrays that mix
 * strings and objects are handled correctly. The caller's array is never modified.
 *
 * @param {String|null} listName
 *   When truthy, it is used as a prefix for the `persistStateKey` and `persistSourcesKey`
 *   options handed to `RequestList`; otherwise both keys are set to `null`.
 * @param {Array<Object|String>} sources
 *   Non-empty array of source objects and/or URL strings.
 * @param {Object} [options]
 *   Extra options spread into the `RequestList` constructor options. The persistence
 *   keys and `sources` computed here take precedence over same-named keys in `options`.
 * @returns {Promise<RequestList>} The list, after `initialize()` has resolved.
 * @throws {Error} If `sources` is an empty array.
 */
export const openRequestList = async (listName, sources, options = {}) => {
    checkParamOrThrow(listName, 'listName', 'String | Null');
    checkParamOrThrow(sources, 'sources', '[Object | String]');
    if (!sources.length) throw new Error('Parameter sources must not be an empty array.');
    checkParamOrThrow(options, 'options', 'Object');

    // Support arrays of strings, arrays of objects and any mix of the two.
    // The original array is passed through as-is when it contains no strings.
    const isUrlString = source => typeof source === 'string';
    const normalizedSources = sources.some(isUrlString)
        ? sources.map(source => (isUrlString(source) ? { url: source } : source))
        : sources;

    const rl = new RequestList({
        ...options,
        persistStateKey: listName ? `${listName}-${STATE_PERSISTENCE_KEY}` : null,
        persistSourcesKey: listName ? `${listName}-${SOURCES_PERSISTENCE_KEY}` : null,
        sources: normalizedSources,
    });
    await rl.initialize();
    return rl;
};
github apifytech / apify-js / src / crawlers / cheerio_crawler.js View on Github external
maxRequestRetries,
            maxRequestsPerCrawl,
            handleFailedRequestFunction,
            autoscaledPoolOptions,
            prepareRequestFunction,
        } = _.defaults({}, options, DEFAULT_OPTIONS);

        checkParamOrThrow(handlePageFunction, 'options.handlePageFunction', 'Function');
        checkParamOrThrow(requestOptions, 'options.requestOptions', 'Maybe Object');
        checkParamOrThrow(requestTimeoutSecs, 'options.requestTimeoutSecs', 'Number');
        checkParamOrThrow(handlePageTimeoutSecs, 'options.handlePageTimeoutSecs', 'Number');
        checkParamOrThrow(ignoreSslErrors, 'options.ignoreSslErrors', 'Maybe Boolean');
        checkParamOrThrow(useApifyProxy, 'options.useApifyProxy', 'Maybe Boolean');
        checkParamOrThrow(apifyProxyGroups, 'options.apifyProxyGroups', 'Maybe [String]');
        checkParamOrThrow(apifyProxySession, 'options.apifyProxySession', 'Maybe String');
        checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe [String]');
        checkParamOrThrow(prepareRequestFunction, 'options.prepareRequestFunction', 'Maybe Function');
        // Enforce valid proxy configuration
        if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
        if (useApifyProxy && proxyUrls) throw new Error('Cannot combine "options.useApifyProxy" with "options.proxyUrls"!');

        this.requestOptions = requestOptions;
        this.handlePageFunction = handlePageFunction;
        this.handlePageTimeoutSecs = handlePageTimeoutSecs;
        this.requestTimeoutSecs = requestTimeoutSecs;
        this.ignoreSslErrors = ignoreSslErrors;
        this.useApifyProxy = useApifyProxy;
        this.apifyProxyGroups = apifyProxyGroups;
        this.apifyProxySession = apifyProxySession;
        this.proxyUrls = _.shuffle(proxyUrls);
        this.lastUsedProxyUrlIndex = 0;
        this.prepareRequestFunction = prepareRequestFunction;
github apifytech / apify-js / src / actor.js View on Github external
const { build, memoryMbytes, timeoutSecs, webhooks } = options;
    const runTaskOpts = { taskId };
    checkParamOrThrow(build, 'build', 'Maybe String');
    checkParamOrThrow(memoryMbytes, 'memoryMbytes', 'Maybe Number');
    checkParamOrThrow(timeoutSecs, 'timeoutSecs', 'Maybe Number');
    checkParamOrThrow(webhooks, 'webhooks', 'Maybe Array');
    if (token) runTaskOpts.token = token;
    if (build) runTaskOpts.build = build;
    if (memoryMbytes) runTaskOpts.memory = memoryMbytes;
    if (timeoutSecs >= 0) runTaskOpts.timeout = timeoutSecs; // Zero is valid value!
    if (input) runTaskOpts.input = input;
    if (webhooks) runTaskOpts.webhooks = webhooks;

    // Start task.
    const { waitSecs } = options;
    checkParamOrThrow(waitSecs, 'waitSecs', 'Maybe Number');
    const run = await tasks.runTask(runTaskOpts);
    if (waitSecs <= 0) return run; // In this case there is nothing more to do.

    // Wait for run to finish.
    const updatedRun = await waitForRunToFinish({
        actId: run.actId,
        runId: run.id,
        token,
        waitSecs,
        taskId,
    });

    // Finish if output is not requested or the run hasn't finished.
    const { fetchOutput = true } = options;
    if (!fetchOutput || updatedRun.status !== ACT_JOB_STATUSES.SUCCEEDED) return updatedRun;
github apifytech / apify-js / src / puppeteer_pool.js View on Github external
useLiveView,
        } = _.defaults({}, options, DEFAULT_OPTIONS);

        // Disabling due to memory leak.
        const reusePages = false;

        checkParamOrThrow(reusePages, 'options.reusePages', 'Boolean');
        checkParamOrThrow(maxOpenPagesPerInstance, 'options.maxOpenPagesPerInstance', 'Number');
        checkParamOrThrow(retireInstanceAfterRequestCount, 'options.retireInstanceAfterRequestCount', 'Number');
        checkParamOrThrow(launchPuppeteerFunction, 'options.launchPuppeteerFunction', 'Function');
        checkParamOrThrow(puppeteerOperationTimeoutSecs, 'options.puppeteerOperationTimeoutSecs', 'Number');
        checkParamOrThrow(instanceKillerIntervalMillis, 'options.instanceKillerIntervalMillis', 'Maybe Number');
        if (instanceKillerIntervalMillis) {
            log.deprecated('PuppeteerPool: options.instanceKillerIntervalMillis is deprecated, use options.instanceKillerIntervalSecs instead.');
        }
        checkParamOrThrow(instanceKillerIntervalSecs, 'options.instanceKillerIntervalSecs', 'Number');
        checkParamOrThrow(killInstanceAfterMillis, 'options.killInstanceAfterMillis', 'Maybe Number');
        if (killInstanceAfterMillis) {
            log.deprecated('PuppeteerPool: options.killInstanceAfterMillis is deprecated, use options.killInstanceAfterSecs instead.');
        }
        checkParamOrThrow(killInstanceAfterSecs, 'options.killInstanceAfterSecs', 'Number');
        checkParamOrThrow(launchPuppeteerOptions, 'options.launchPuppeteerOptions', 'Maybe Object');
        checkParamOrThrow(recycleDiskCache, 'options.recycleDiskCache', 'Maybe Boolean');
        checkParamOrThrow(useIncognitoPages, 'options.useIncognitoPages', 'Maybe Boolean');
        checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe Array');
        // Enforce non-empty proxyUrls array
        if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
        checkParamOrThrow(useLiveView, 'options.useLiveView', 'Maybe Boolean');

        // Config.
        this.reusePages = reusePages;
        this.maxOpenPagesPerInstance = maxOpenPagesPerInstance;
github apifytech / apify-js / src / utils.js View on Github external
/**
 * Builds a new `ApifyClient` configured from environment variables.
 *
 * `userId` and `token` are read from the variables named by `ENV_VARS.USER_ID` and
 * `ENV_VARS.TOKEN`, falling back to `null` when unset or empty.
 *
 * @returns {ApifyClient} A freshly constructed client instance.
 */
export const newClient = () => {
    const { env } = process;
    const { USER_ID, TOKEN, API_BASE_URL } = ENV_VARS;

    // Only pass baseUrl when the env var overrides it, so that the client's own default
    // ('https://api.apify.com') applies otherwise. This keeps local development pointed
    // at production unless the user explicitly asks for something else.
    const baseUrlOverride = env[API_BASE_URL];

    const clientOptions = {
        userId: env[USER_ID] || null,
        token: env[TOKEN] || null,
        ...(baseUrlOverride ? { baseUrl: baseUrlOverride } : {}),
    };

    return new ApifyClient(clientOptions);
};
github apifytech / apify-js / src / key_value_store.js View on Github external
async getValue(key) {
        validateGetValueParams(key);

        await this.initializationPromise;

        try {
            const result = await this._handleFile(key, readFilePromised);
            return result
                ? parseBody(result.returnValue, mime.getType(result.fileName))
                : null;
        } catch (err) {
            throw new Error(`Error reading file '${key}' in directory '${this.localStoragePath}' referred by ${ENV_VARS.LOCAL_STORAGE_DIR} environment variable: ${err.message}`); // eslint-disable-line
        }
    }
github apifytech / apify-js / src / dataset.js View on Github external
/**
 * Opens a dataset, choosing between the local and the remote storage opener.
 *
 * The local opener is used when the environment variable named by
 * `ENV_VARS.LOCAL_STORAGE_DIR` is set to a non-empty value and `options.forceCloud`
 * is not truthy; in every other case the remote opener is used.
 *
 * @param {String} [datasetIdOrName] Validated as 'Maybe String'.
 * @param {Object} [options]
 * @param {Boolean} [options.forceCloud=false] Use the remote opener even when the
 *   local storage directory variable is set.
 * @returns {*} Whatever the selected opener (`openLocalStorage` / `openRemoteStorage`) returns.
 */
export const openDataset = (datasetIdOrName, options = {}) => {
    checkParamOrThrow(datasetIdOrName, 'datasetIdOrName', 'Maybe String');
    checkParamOrThrow(options, 'options', 'Object');
    ensureTokenOrLocalStorageEnvExists('dataset');

    const forceCloud = options.forceCloud === undefined ? false : options.forceCloud;
    checkParamOrThrow(forceCloud, 'options.forceCloud', 'Boolean');

    const localStorageDir = process.env[ENV_VARS.LOCAL_STORAGE_DIR];
    if (localStorageDir && !forceCloud) {
        return openLocalStorage(datasetIdOrName, ENV_VARS.DEFAULT_DATASET_ID, DatasetLocal, datasetsCache);
    }

    return openRemoteStorage(datasetIdOrName, ENV_VARS.DEFAULT_DATASET_ID, Dataset, datasetsCache, getOrCreateDataset);
};

apify-client

Apify API client for JavaScript

Apache-2.0
Latest version published 16 days ago

Package Health Score

84 / 100
Full package analysis