Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
maxRequestRetries,
maxRequestsPerCrawl,
autoscaledPoolOptions,
// AutoscaledPool shorthands
minConcurrency,
maxConcurrency,
} = _.defaults({}, options, DEFAULT_OPTIONS);
checkParamPrototypeOrThrow(requestList, 'options.requestList', RequestList, 'Apify.RequestList', true);
checkParamPrototypeOrThrow(requestQueue, 'options.requestQueue', [RequestQueue, RequestQueueLocal], 'Apify.RequestQueue', true);
checkParamOrThrow(handleRequestFunction, 'options.handleRequestFunction', 'Function');
checkParamOrThrow(handleRequestTimeoutSecs, 'options.handleRequestTimeoutSecs', 'Number');
checkParamOrThrow(handleFailedRequestFunction, 'options.handleFailedRequestFunction', 'Function');
checkParamOrThrow(maxRequestRetries, 'options.maxRequestRetries', 'Number');
checkParamOrThrow(maxRequestsPerCrawl, 'options.maxRequestsPerCrawl', 'Maybe Number');
checkParamOrThrow(autoscaledPoolOptions, 'options.autoscaledPoolOptions', 'Object');
if (!requestList && !requestQueue) {
throw new Error('At least one of the parameters "options.requestList" and "options.requestQueue" must be provided!');
}
this.requestList = requestList;
this.requestQueue = requestQueue;
this.handleRequestFunction = handleRequestFunction;
this.handleRequestTimeoutSecs = handleRequestTimeoutSecs;
this.handleFailedRequestFunction = handleFailedRequestFunction;
this.maxRequestRetries = maxRequestRetries;
this.handledRequestsCount = 0;
this.stats = new Statistics({ logMessage: 'Crawler request statistics:' });
let shouldLogMaxPagesExceeded = true;
if (userData) {
log.deprecated('options.userData of Apify.utils.enqueueLinks() is deprecated. Use options.transformRequestFunction instead.');
}
checkParamOrThrow(page, 'page', 'Maybe Object');
checkParamOrThrow($, '$', 'Maybe Function');
if (!page && !$) {
throw new Error('One of the parameters "options.page" or "options.$" must be provided!');
}
if (page && $) {
throw new Error('Only one of the parameters "options.page" or "options.$" must be provided!');
}
checkParamOrThrow(selector, 'selector', 'String');
checkParamPrototypeOrThrow(requestQueue, 'requestQueue', [RequestQueue, RequestQueueLocal], 'Apify.RequestQueue');
checkParamOrThrow(baseUrl, 'baseUrl', 'Maybe String');
if (baseUrl && page) log.warning('The parameter options.baseUrl can only be used when parsing a Cheerio object. It will be ignored.');
checkParamOrThrow(pseudoUrls, 'pseudoUrls', 'Maybe Array');
checkParamOrThrow(userData, 'userData', 'Maybe Object');
checkParamOrThrow(transformRequestFunction, 'transformRequestFunction', 'Maybe Function');
// Construct pseudoUrls from input where necessary.
const pseudoUrlInstances = constructPseudoUrlInstances(pseudoUrls || []);
const urls = page ? await extractUrlsFromPage(page, selector) : extractUrlsFromCheerio($, selector, baseUrl);
let requestOptions = createRequestOptions(urls, userData);
if (transformRequestFunction) {
requestOptions = requestOptions.map(transformRequestFunction).filter(r => !!r);
}
const requests = createRequests(requestOptions, pseudoUrlInstances);
return addRequestsToQueueInBatches(requests, requestQueue);
}
launchPuppeteerOptions,
recycleDiskCache,
useIncognitoPages,
proxyUrls,
useLiveView,
} = _.defaults({}, options, DEFAULT_OPTIONS);
// Disabling due to memory leak.
const reusePages = false;
checkParamOrThrow(reusePages, 'options.reusePages', 'Boolean');
checkParamOrThrow(maxOpenPagesPerInstance, 'options.maxOpenPagesPerInstance', 'Number');
checkParamOrThrow(retireInstanceAfterRequestCount, 'options.retireInstanceAfterRequestCount', 'Number');
checkParamOrThrow(launchPuppeteerFunction, 'options.launchPuppeteerFunction', 'Function');
checkParamOrThrow(puppeteerOperationTimeoutSecs, 'options.puppeteerOperationTimeoutSecs', 'Number');
checkParamOrThrow(instanceKillerIntervalMillis, 'options.instanceKillerIntervalMillis', 'Maybe Number');
if (instanceKillerIntervalMillis) {
log.deprecated('PuppeteerPool: options.instanceKillerIntervalMillis is deprecated, use options.instanceKillerIntervalSecs instead.');
}
checkParamOrThrow(instanceKillerIntervalSecs, 'options.instanceKillerIntervalSecs', 'Number');
checkParamOrThrow(killInstanceAfterMillis, 'options.killInstanceAfterMillis', 'Maybe Number');
if (killInstanceAfterMillis) {
log.deprecated('PuppeteerPool: options.killInstanceAfterMillis is deprecated, use options.killInstanceAfterSecs instead.');
}
checkParamOrThrow(killInstanceAfterSecs, 'options.killInstanceAfterSecs', 'Number');
checkParamOrThrow(launchPuppeteerOptions, 'options.launchPuppeteerOptions', 'Maybe Object');
checkParamOrThrow(recycleDiskCache, 'options.recycleDiskCache', 'Maybe Boolean');
checkParamOrThrow(useIncognitoPages, 'options.useIncognitoPages', 'Maybe Boolean');
checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe Array');
// Enforce non-empty proxyUrls array
if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
checkParamOrThrow(useLiveView, 'options.useLiveView', 'Maybe Boolean');
/**
 * Creates a `RequestList` from the given sources, initializes it and returns it.
 *
 * Sources may be plain URL strings, request-like objects, or a mix of both;
 * every string entry is converted to `{ url }` before the list is constructed.
 * When `listName` is provided, the state and sources persistence keys are derived
 * from it; otherwise both keys are `null`.
 *
 * @param {string|null} listName - Name used to build the persistence keys, or `null`.
 * @param {Array<Object|string>} sources - Non-empty array of URL strings and/or source objects.
 * @param {Object} [options] - Extra options spread into the `RequestList` constructor options.
 *   `persistStateKey`, `persistSourcesKey` and `sources` set here are overridden.
 * @returns {Promise<RequestList>} The `RequestList` instance after `initialize()` has resolved.
 * @throws {Error} If `sources` is an empty array (parameter type checks are delegated
 *   to `checkParamOrThrow`).
 */
export const openRequestList = async (listName, sources, options = {}) => {
    checkParamOrThrow(listName, 'listName', 'String | Null');
    checkParamOrThrow(sources, 'sources', '[Object | String]');
    if (!sources.length) throw new Error('Parameter sources must not be an empty array.');
    checkParamOrThrow(options, 'options', 'Object');

    // Support strings, objects, and mixed arrays: decide per element instead of
    // looking only at the first one. An array without any strings is passed
    // through untouched (same reference) and the parameter is never reassigned.
    const containsStrings = sources.some(source => typeof source === 'string');
    const normalizedSources = containsStrings
        ? sources.map(source => (typeof source === 'string' ? { url: source } : source))
        : sources;

    const rl = new RequestList({
        ...options,
        persistStateKey: listName ? `${listName}-${STATE_PERSISTENCE_KEY}` : null,
        persistSourcesKey: listName ? `${listName}-${SOURCES_PERSISTENCE_KEY}` : null,
        sources: normalizedSources,
    });
    await rl.initialize();
    return rl;
};
maxRequestRetries,
maxRequestsPerCrawl,
handleFailedRequestFunction,
autoscaledPoolOptions,
prepareRequestFunction,
} = _.defaults({}, options, DEFAULT_OPTIONS);
checkParamOrThrow(handlePageFunction, 'options.handlePageFunction', 'Function');
checkParamOrThrow(requestOptions, 'options.requestOptions', 'Maybe Object');
checkParamOrThrow(requestTimeoutSecs, 'options.requestTimeoutSecs', 'Number');
checkParamOrThrow(handlePageTimeoutSecs, 'options.handlePageTimeoutSecs', 'Number');
checkParamOrThrow(ignoreSslErrors, 'options.ignoreSslErrors', 'Maybe Boolean');
checkParamOrThrow(useApifyProxy, 'options.useApifyProxy', 'Maybe Boolean');
checkParamOrThrow(apifyProxyGroups, 'options.apifyProxyGroups', 'Maybe [String]');
checkParamOrThrow(apifyProxySession, 'options.apifyProxySession', 'Maybe String');
checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe [String]');
checkParamOrThrow(prepareRequestFunction, 'options.prepareRequestFunction', 'Maybe Function');
// Enforce valid proxy configuration
if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
if (useApifyProxy && proxyUrls) throw new Error('Cannot combine "options.useApifyProxy" with "options.proxyUrls"!');
this.requestOptions = requestOptions;
this.handlePageFunction = handlePageFunction;
this.handlePageTimeoutSecs = handlePageTimeoutSecs;
this.requestTimeoutSecs = requestTimeoutSecs;
this.ignoreSslErrors = ignoreSslErrors;
this.useApifyProxy = useApifyProxy;
this.apifyProxyGroups = apifyProxyGroups;
this.apifyProxySession = apifyProxySession;
this.proxyUrls = _.shuffle(proxyUrls);
this.lastUsedProxyUrlIndex = 0;
this.prepareRequestFunction = prepareRequestFunction;
const { build, memoryMbytes, timeoutSecs, webhooks } = options;
const runTaskOpts = { taskId };
checkParamOrThrow(build, 'build', 'Maybe String');
checkParamOrThrow(memoryMbytes, 'memoryMbytes', 'Maybe Number');
checkParamOrThrow(timeoutSecs, 'timeoutSecs', 'Maybe Number');
checkParamOrThrow(webhooks, 'webhooks', 'Maybe Array');
if (token) runTaskOpts.token = token;
if (build) runTaskOpts.build = build;
if (memoryMbytes) runTaskOpts.memory = memoryMbytes;
if (timeoutSecs >= 0) runTaskOpts.timeout = timeoutSecs; // Zero is valid value!
if (input) runTaskOpts.input = input;
if (webhooks) runTaskOpts.webhooks = webhooks;
// Start task.
const { waitSecs } = options;
checkParamOrThrow(waitSecs, 'waitSecs', 'Maybe Number');
const run = await tasks.runTask(runTaskOpts);
if (waitSecs <= 0) return run; // In this case there is nothing more to do.
// Wait for run to finish.
const updatedRun = await waitForRunToFinish({
actId: run.actId,
runId: run.id,
token,
waitSecs,
taskId,
});
// Finish if output is not requested or run haven't finished.
const { fetchOutput = true } = options;
if (!fetchOutput || updatedRun.status !== ACT_JOB_STATUSES.SUCCEEDED) return updatedRun;
useLiveView,
} = _.defaults({}, options, DEFAULT_OPTIONS);
// Disabling due to memory leak.
const reusePages = false;
checkParamOrThrow(reusePages, 'options.reusePages', 'Boolean');
checkParamOrThrow(maxOpenPagesPerInstance, 'options.maxOpenPagesPerInstance', 'Number');
checkParamOrThrow(retireInstanceAfterRequestCount, 'options.retireInstanceAfterRequestCount', 'Number');
checkParamOrThrow(launchPuppeteerFunction, 'options.launchPuppeteerFunction', 'Function');
checkParamOrThrow(puppeteerOperationTimeoutSecs, 'options.puppeteerOperationTimeoutSecs', 'Number');
checkParamOrThrow(instanceKillerIntervalMillis, 'options.instanceKillerIntervalMillis', 'Maybe Number');
if (instanceKillerIntervalMillis) {
log.deprecated('PuppeteerPool: options.instanceKillerIntervalMillis is deprecated, use options.instanceKillerIntervalSecs instead.');
}
checkParamOrThrow(instanceKillerIntervalSecs, 'options.instanceKillerIntervalSecs', 'Number');
checkParamOrThrow(killInstanceAfterMillis, 'options.killInstanceAfterMillis', 'Maybe Number');
if (killInstanceAfterMillis) {
log.deprecated('PuppeteerPool: options.killInstanceAfterMillis is deprecated, use options.killInstanceAfterSecs instead.');
}
checkParamOrThrow(killInstanceAfterSecs, 'options.killInstanceAfterSecs', 'Number');
checkParamOrThrow(launchPuppeteerOptions, 'options.launchPuppeteerOptions', 'Maybe Object');
checkParamOrThrow(recycleDiskCache, 'options.recycleDiskCache', 'Maybe Boolean');
checkParamOrThrow(useIncognitoPages, 'options.useIncognitoPages', 'Maybe Boolean');
checkParamOrThrow(proxyUrls, 'options.proxyUrls', 'Maybe Array');
// Enforce non-empty proxyUrls array
if (proxyUrls && !proxyUrls.length) throw new Error('Parameter "options.proxyUrls" of type Array must not be empty');
checkParamOrThrow(useLiveView, 'options.useLiveView', 'Maybe Boolean');
// Config.
this.reusePages = reusePages;
this.maxOpenPagesPerInstance = maxOpenPagesPerInstance;
/**
 * Builds a new `ApifyClient` configured from environment variables.
 *
 * `userId` and `token` are read from the environment and fall back to `null`
 * when unset or empty. `baseUrl` is included only when the API base URL
 * environment variable is set, so the client's own default applies otherwise.
 *
 * @returns {ApifyClient} A freshly constructed client instance.
 */
export const newClient = () => {
    const { env } = process;

    // Only pass baseUrl when overridden by env var, so that the client default is used otherwise.
    // This keeps local development pointed at production unless the user wants otherwise.
    const baseUrlOverride = env[ENV_VARS.API_BASE_URL];
    const baseUrlOption = baseUrlOverride ? { baseUrl: baseUrlOverride } : {};

    return new ApifyClient({
        userId: env[ENV_VARS.USER_ID] || null,
        token: env[ENV_VARS.TOKEN] || null,
        ...baseUrlOption,
    });
};
async getValue(key) {
validateGetValueParams(key);
await this.initializationPromise;
try {
const result = await this._handleFile(key, readFilePromised);
return result
? parseBody(result.returnValue, mime.getType(result.fileName))
: null;
} catch (err) {
throw new Error(`Error reading file '${key}' in directory '${this.localStoragePath}' referred by ${ENV_VARS.LOCAL_STORAGE_DIR} environment variable: ${err.message}`); // eslint-disable-line
}
}
/**
 * Opens a dataset, choosing between the local and the remote storage implementation.
 *
 * The local implementation is used when the local storage directory environment
 * variable is set and `options.forceCloud` is not `true`; otherwise the remote one is used.
 *
 * @param {string} [datasetIdOrName] - ID or name of the dataset; may be omitted.
 * @param {Object} [options]
 * @param {boolean} [options.forceCloud=false] - When `true`, the remote storage is used
 *   even if the local storage directory environment variable is set.
 * @returns {*} Whatever `openLocalStorage` or `openRemoteStorage` returns for the dataset.
 */
export const openDataset = (datasetIdOrName, options = {}) => {
    checkParamOrThrow(datasetIdOrName, 'datasetIdOrName', 'Maybe String');
    checkParamOrThrow(options, 'options', 'Object');
    ensureTokenOrLocalStorageEnvExists('dataset');

    const { forceCloud = false } = options;
    checkParamOrThrow(forceCloud, 'options.forceCloud', 'Boolean');

    const localStorageDir = process.env[ENV_VARS.LOCAL_STORAGE_DIR];
    if (localStorageDir && !forceCloud) {
        return openLocalStorage(datasetIdOrName, ENV_VARS.DEFAULT_DATASET_ID, DatasetLocal, datasetsCache);
    }

    return openRemoteStorage(datasetIdOrName, ENV_VARS.DEFAULT_DATASET_ID, Dataset, datasetsCache, getOrCreateDataset);
};