Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
export const main = (userFunc) => {
if (!userFunc || typeof (userFunc) !== 'function') {
throw new Error(`Apify.main() accepts a single parameter that must be a function (was '${userFunc === null ? 'null' : typeof (userFunc)}').`);
}
if (!process.env[ENV_VARS.LOCAL_STORAGE_DIR] && !process.env[ENV_VARS.TOKEN]) {
const dir = path.join(process.cwd(), './apify_storage');
process.env[ENV_VARS.LOCAL_STORAGE_DIR] = dir;
log.warning(`Neither ${ENV_VARS.LOCAL_STORAGE_DIR} nor ${ENV_VARS.TOKEN} environment variable is set, defaulting to ${ENV_VARS.LOCAL_STORAGE_DIR}="${dir}"`); // eslint-disable-line max-len
}
// This is to enable unit tests where process.exit() is mocked and doesn't really exit the process
// Note that mocked process.exit() might throw, so set exited flag before calling it to avoid confusion.
let exited = false;
const exitWithError = (err, exitCode, message) => {
log.exception(err, message);
exited = true;
// console.log(`Exiting with code: ${exitCode}`);
process.exit(exitCode);
};
// Set dummy interval to ensure the process will not be killed while awaiting empty promise:
// await new Promise(() => {})
// Such a construct is used for testing of actor timeouts and aborts.
const intervalId = setInterval(_.noop, 9999999);
*
* Example usage:
* ```
* const matches = text.match(Apify.utils.social.YOUTUBE_REGEX_GLOBAL);
* if (matches) console.log(`${matches.length} Youtube videos found!`);
* ```
* @type {RegExp}
* @memberOf social
*/
YOUTUBE_REGEX_GLOBAL = new RegExp(YOUTUBE_REGEX_STRING, 'ig');
} catch (e) {
// Older versions of Node don't support negative lookbehind and lookahead expressions.
// Show warning instead of failing.
if (e && e.message && e.message.includes('Invalid group')) {
// eslint-disable-next-line max-len
log.warning(`Your version of Node.js (${process.version}) doesn't support the regular expression syntax used by Apify.utils.social tools. The tools will not work. Please upgrade your Node.js to the latest version.`);
} else {
throw e;
}
}
/**
* The function attempts to extract emails, phone numbers and social profile URLs from a HTML document,
* specifically LinkedIn, Twitter, Instagram and Facebook profile URLs.
* The function removes duplicates from the resulting arrays and sorts the items alphabetically.
*
* The result of the function is an object with the following structure:
* ```
* {
* emails: String[],
* phones: String[],
});
}
const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = await this.queryQueueHeadPromise;
// TODO: I feel this code below can be greatly simplified...
// If queue is still empty then one of the following holds:
// - the other calls waiting for this promise already consumed all the returned requests
// - the limit was too low and contained only requests in progress
// - the writes from other clients were not propagated yet
// - the whole queue was processed and we are done
// If limit was not reached in the call then there are no more requests to be returned.
if (prevLimit >= REQUEST_QUEUE_HEAD_MAX_LIMIT) {
log.warning(`RequestQueue: Reached the maximum number of requests in progress: ${REQUEST_QUEUE_HEAD_MAX_LIMIT}.`);
}
const shouldRepeatWithHigherLimit = this.queueHeadDict.length() === 0
&& wasLimitReached
&& prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT;
// If ensureConsistency=true then we must ensure that either:
// - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
// - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
const isDatabaseConsistent = queryStartedAt - queueModifiedAt >= API_PROCESSED_REQUESTS_DELAY_MILLIS;
const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount;
// Consistent information from one source is enough to consider request queue finished.
const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent;
// If both are false then head is consistent and we may exit.
if (!shouldRepeatWithHigherLimit && !shouldRepeatForConsistency) return true;
// WORKAROUND:
// It happened to some users that state object contained something like:
// {
// "nextIndex": 11308,
// "nextUniqueKey": "https://www.anychart.com",
// "inProgress": {
// "https://www.ams360.com": true,
// ...
// "https://www.anychart.com": true,
// }
// Which then caused error "The request is not being processed (uniqueKey: https://www.anychart.com)"
// As a workaround, we just remove all inProgress requests whose index >= nextIndex,
// since they will be crawled again.
if (deleteFromInProgress.length) {
log.warning('RequestList\'s in-progress field is not consistent, skipping invalid in-progress entries', { deleteFromInProgress });
_.each(deleteFromInProgress, (uniqueKey) => {
delete state.inProgress[uniqueKey];
});
}
this.nextIndex = state.nextIndex;
this.inProgress = state.inProgress;
// All in-progress requests need to be recrawled
this.reclaimed = _.clone(this.inProgress);
}
this.reject = reject;
});
this._maybeRunTask();
// This is here because if we scale down to, let's say, 1, then after each promise finishes
// this._maybeRunTask() doesn't trigger another one. So if that 1 instance gets stuck, the
// whole actor gets stuck, and even after scaling up it never triggers another promise.
this.maybeRunTaskInterval = setInterval(() => this._maybeRunTask(), this.maybeRunIntervalMillis);
// This interval checks memory and in each call saves current memory stats and in every
// SCALE_UP_INTERVAL-th/SCALE_DOWN_INTERVAL-th call it may scale up/down based on memory.
if (isAtHome()) {
this.autoscaleInterval = setInterval(() => this._autoscale(), AUTOSCALE_INTERVAL_MILLIS);
} else {
log.warning('Auto-scaling is currently available only when running on Apify platform! '
+ 'Use `minConcurrency` parameter if you need to test multiple requests in parallel. '
+ 'This feature will be enabled soon.');
}
return this.poolPromise
.then(() => {
this._destroy();
})
.catch((err) => {
this._destroy();
throw err;
});
}
export const enqueueLinks = async (...args) => {
// TODO: Remove after v1.0.0 gets released.
// Refactor enqueueLinks to use an options object and keep backwards compatibility
let page, $, selector, requestQueue, pseudoUrls, userData; // eslint-disable-line
if (args.length === 1) {
[{ page, $, selector = 'a', requestQueue, pseudoUrls, userData = {} }] = args;
} else {
[page, selector = 'a', requestQueue, pseudoUrls, userData = {}] = args;
if (logDeprecationWarning) {
log.warning('Passing individual arguments to enqueueLinks() is deprecated. '
+ 'Use an options object: enqueueLinks({ page, selector, requestQueue, pseudoUrls, userData }) instead.');
logDeprecationWarning = false;
}
}
// Check for pseudoUrls as a third parameter.
if (Array.isArray(requestQueue)) {
const tmp = requestQueue;
requestQueue = pseudoUrls;
pseudoUrls = tmp;
}
checkParamOrThrow(page, 'page', 'Maybe Object');
checkParamOrThrow($, '$', 'Maybe Function');
if (!page && !$) {
throw new Error('One of the parameters "options.page" or "options.$" must be provided!');
import PseudoUrl from './pseudo_url';
import LiveViewServer from './live_view/live_view_server';
import { requestAsBrowser } from './utils_request';
import { openSessionPool } from './session_pool/session_pool';
import { Session } from './session_pool/session';
// Raise the global threshold at which EventEmitter prints its memory-leak warning.
EventEmitter.defaultMaxListeners = 50;

// Emit log lines as plain text instead of JSON.
log.logJson = false;

// TODO: remove this when we release v1.0.0
// Legacy environment variable: warn whenever it is set, and copy its value into the
// current storage-directory variable unless that one already has a value.
const EMULATION_ENV_VAR = 'APIFY_LOCAL_EMULATION_DIR';
if (process.env[EMULATION_ENV_VAR]) {
    log.warning(`Environment variable "${EMULATION_ENV_VAR}" is deprecated!!! Use "${ENV_VARS.LOCAL_STORAGE_DIR}" instead!`);
    const storageDirVarName = ENV_VARS.LOCAL_STORAGE_DIR;
    const isStorageDirAlreadySet = Boolean(process.env[storageDirVarName]);
    if (!isStorageDirAlreadySet) {
        process.env[storageDirVarName] = process.env[EMULATION_ENV_VAR];
    }
}

// Print basic system info (apify and apify-client version, NodeJS version, ...).
logSystemInfo();

// Warn the user when the installed SDK is outdated.
printOutdatedSdkWarning();
/**
* The following section describes all functions and properties provided by the `apify` package,
* except individual classes and namespaces that have their separate, detailed, documentation pages
* accessible from the left sidebar.
*
* @module Apify
*/
constructor(opts) {
// TODO: remove this when we release v1.0.0
// For backwards compatibility with opts.workerFunction.
if (opts.workerFunction) {
// For backwards compatibility with opts.finishWhenEmpty and this.finish();
if (opts.finishWhenEmpty !== undefined) {
log.warning('AutoscaledPool: Parameter `finishWhenEmpty` is deprecated!!! Use `isFinishedFunction` instead!');
checkParamOrThrow(opts.finishWhenEmpty, 'opts.finishWhenEmpty', 'Boolean');
let mayFinish = false;
opts.isFinishedFunction = () => Promise.resolve(mayFinish);
this.finish = () => { mayFinish = true; };
} else {
opts.isFinishedFunction = () => Promise.resolve(true);
}
log.warning('AutoscaledPool: Parameter `workerFunction` is deprecated!!! Use `runTaskFunction` instead!');
checkParamOrThrow(opts.workerFunction, 'opts.workerFunction', 'Function');
opts.runTaskFunction = opts.workerFunction;
opts.isTaskReadyFunction = () => Promise.resolve(true);
}
const {
maxConcurrency,
/**
 * Clicks, one after another, every element on the page that matches the selector.
 *
 * Before each click, `updateElementCssToEnableMouseClick` is evaluated in the page with the
 * element handle and an incrementing z-index that starts at `STARTING_Z_INDEX`.
 * A failed click never aborts the loop: the failure is logged at debug level and the next
 * element is tried. The first time a failure mentions that the element "is detached from
 * document", a single explanatory warning is logged; later occurrences stay at debug level.
 *
 * @param {Object} page
 *   Page object; only its `$$()` and `evaluate()` methods are used here.
 * @param {String} selector
 *   Selector passed to `page.$$()` to find the elements to click.
 * @return {Promise<void>}
 *   Resolves once all matched elements were attempted. Rejects only if `page.$$()` rejects.
 */
export async function clickElements(page, selector) {
    const elementHandles = await page.$$(selector);
    log.debug(`enqueueLinksByClickingElements: There are ${elementHandles.length} elements to click.`);
    let clickedElementsCount = 0;
    let zIndex = STARTING_Z_INDEX;
    let shouldLogWarning = true;
    for (const handle of elementHandles) {
        try {
            await page.evaluate(updateElementCssToEnableMouseClick, handle, zIndex++);
            await handle.click();
            clickedElementsCount++;
        } catch (err) {
            // A rejection value is not guaranteed to be an Error, so `stack` may be missing.
            // Reading it unguarded would throw from inside this handler and abort the
            // remaining clicks, defeating the purpose of catching per element.
            const stack = err ? err.stack : undefined;
            const isDetachedError = typeof stack === 'string' && stack.includes('is detached from document');
            if (shouldLogWarning && isDetachedError) {
                log.warning(`An element with selector ${selector} that you're trying to click has been removed from the page. `
                    + 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. '
                    + 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.');
                // Only warn once per call; repeated detachments are expected after the first.
                shouldLogWarning = false;
            }
            log.debug('enqueueLinksByClickingElements: Click failed.', { stack });
        }
    }
    log.debug(`enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`);
}
enqueueLinks: async (...args) => {
if (logEnqueueLinksDeprecationWarning) {
log.warning('Using enqueueLinks() from the Apify.utils.puppeteer namespace is deprecated. '
+ 'Please use the Apify.utils.enqueueLinks().');
logEnqueueLinksDeprecationWarning = false;
return enqueueLinks(...args);
}
},
enqueueLinksByClickingElements,