How to use the apify.main function in apify

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.

github apifytech / apify-js / examples / puppeteer_with_proxy.js View on Github external
/**
 * This example demonstrates how to load pages in headless Chrome / Puppeteer
 * over <a href="https://apify.com/docs/proxy" target="_blank">Apify Proxy</a>.
 * To make it work, you'll need an Apify Account
 * that has access to the proxy.
 * The proxy password is available on the <a href="https://my.apify.com/proxy" target="_blank">Proxy</a> page in the app.
 * Just set it to the `APIFY_PROXY_PASSWORD` [environment variable](../guides/environmentvariables)
 * or run the script using the CLI.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
 * on the source tab of your actor configuration.
 */

const Apify = require('apify');

Apify.main(async () => {
    // Apify.launchPuppeteer() is similar to Puppeteer's launch() function.
    // It accepts the same parameters and returns a preconfigured Puppeteer.Browser instance.
    // Moreover, it accepts several additional options, such as useApifyProxy.
    const options = {
        useApifyProxy: true,
    };
    const browser = await Apify.launchPuppeteer(options);

    console.log('Running Puppeteer script...');

    // Proceed with a plain Puppeteer script.
    const page = await browser.newPage();
    const url = 'https://en.wikipedia.org/wiki/Main_Page';
    await page.goto(url);
    const title = await page.title();
github apifytech / apify-js / examples / crawler_puppeteer.js View on Github external
/**
 * This is an example of how to scrape the Hacker News site (https://news.ycombinator.com) using Apify SDK and Puppeteer.
 *
 * Example uses:
 * - Apify PuppeteerCrawler to scrape pages using Puppeteer in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage dynamic queue of pending and handled requests
 * - Puppeteer to control the headless Chrome browser
 */

const Apify = require('apify');

Apify.main(async () => {
    // Get queue and enqueue first url.
    const requestQueue = await Apify.openRequestQueue();

    // Enqueue Start url.
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // This function is executed for each request.
        // If the request fails, it's retried 3 times.
        // The `page` parameter is Puppeteer's page object with the page already loaded.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
github apifytech / apify-cli / src / templates / basic_crawler / main.js View on Github external
// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file,
// so that it can be started by running "npm start".

// Include Apify SDK. For more information, see https://sdk.apify.com/
const Apify = require('apify');

const rp = require('request-promise');

Apify.main(async () => {
    // Get input of the actor (here only for demonstration purposes).
    // If you'd like to have your input checked and have Apify display
    // a user interface for it, add INPUT_SCHEMA.json file to your actor.
    // For more information, see https://apify.com/docs/actor/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    if (!input || !input.sources) throw new Error('Input must be a JSON object with the "sources" field!');

    const requestList = await Apify.openRequestList('my-request-list', input.sources);

    // Create a basic crawler that will use request-promise to download
    // web pages from a given list of URLs
    const basicCrawler = new Apify.BasicCrawler({
        requestList,
github apifytech / apify-js / examples / call_actor.js View on Github external
* The script extracts the current Bitcoin prices from <a href="https://www.kraken.com/" target="_blank">Kraken.com</a>
 * and sends them to your email using the <a href="https://apify.com/apify/send-mail" target="_blank">apify/send-mail</a> actor.
 *
 * To make the example work, you'll need an <a href="https://my.apify.com/" target="_blank">Apify Account</a>.
 * Go to <a href="https://my.apify.com/account#/integrations" target="_blank">Account - Integrations</a> page to obtain your API token
 * and set it to the `APIFY_TOKEN` [environment variable](../guides/environmentvariables), or run the script using the CLI.
 * If you deploy this actor to the Apify Cloud, you can set up a scheduler to run it
 * early every morning.
 *
 * To run this example on the Apify Platform, select the `Node.js 10 + Chrome on Debian (apify/actor-node-chrome)` base image
 * on the source tab of your actor configuration.
 */

const Apify = require('apify');

Apify.main(async () => {
    // Launch the web browser.
    const browser = await Apify.launchPuppeteer();

    console.log('Obtaining email address...');
    const user = await Apify.client.users.getUser();

    // Load Kraken.com charts and get last traded price of BTC
    console.log('Extracting data from kraken.com...');
    const page = await browser.newPage();
    await page.goto('https://www.kraken.com/charts');
    const tradedPricesHtml = await page.$eval('#ticker-top ul', el => el.outerHTML);

    // Send prices to your email. For that, you can use an actor we already
    // have available on the platform under the name: apify/send-mail.
    // The second parameter to the Apify.call() invocation is the actor's
    // desired input. You can find the required input parameters by checking
github apifytech / apify-js / examples / crawler_cheerio.js View on Github external
* This is an example of how to scrape the Hacker News site (https://news.ycombinator.com) using Apify SDK
 * with Cheerio and Request NPM packages.
 *
 * Example uses:
 * - Apify BasicCrawler to scrape pages in parallel
 * - Apify Dataset to store data
 * - Apify RequestQueue to manage dynamic queue of pending and handled requests
 * - Request NPM package to request the HTML content of the website
 * - Cheerio NPM package to parse HTML and extract data
 */

const Apify = require('apify');
const rp = require('request-promise');
const cheerio = require('cheerio');

Apify.main(async () => {
    // Get queue and enqueue first url.
    const requestQueue = await Apify.openRequestQueue();

    // Enqueue Start url.
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create crawler.
    const crawler = new Apify.BasicCrawler({
        requestQueue,

        // This function is executed for each request.
        // If the request fails, it's retried 3 times.
        handleRequestFunction: async ({ request }) => {
            console.log(`Processing ${request.url}...`);

            // Request html of page.
github apifytech / actor-scraper / cheerio-scraper / src / actor.js View on Github external
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { log } = Apify.utils;

/**
 * Actor entry point: fetches the actor input, rejects a missing input,
 * builds a crawler from it via CrawlerSetup and awaits the crawler's run().
 */
Apify.main(async () => {
    log.debug('Reading INPUT.');
    const actorInput = await Apify.getInput();
    if (!actorInput) {
        throw new Error('INPUT cannot be empty!');
    }

    // Turn the input into a crawler instance.
    log.info('Configuring Cheerio Scraper.');
    const crawlerSetup = new CrawlerSetup(actorInput);
    const scraper = await crawlerSetup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await scraper.run();
    log.info('Cheerio Scraper finished.');
});
github apifytech / actor-scraper / puppeteer-scraper / src / actor.js View on Github external
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { log } = Apify.utils;

/**
 * Actor entry point: fetches the actor input, rejects a missing input,
 * builds a crawler from it via CrawlerSetup and awaits the crawler's run().
 */
Apify.main(async () => {
    log.debug('Reading INPUT.');
    const actorInput = await Apify.getInput();
    if (!actorInput) {
        throw new Error('INPUT cannot be empty!');
    }

    // Turn the input into a crawler instance.
    log.info('Configuring Puppeteer Scraper.');
    const crawlerSetup = new CrawlerSetup(actorInput);
    const scraper = await crawlerSetup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await scraper.run();
    log.info('Puppeteer Scraper finished.');
});
github apifytech / actor-scraper / web-scraper / src / actor.js View on Github external
const Apify = require('apify');
const CrawlerSetup = require('./crawler_setup');

const { log } = Apify.utils;

// Set before the actor starts so that it applies to every message logged below.
log.logJson = false;

/**
 * Actor entry point: fetches the actor input, rejects a missing input,
 * builds a crawler from it via CrawlerSetup and awaits the crawler's run().
 */
Apify.main(async () => {
    log.debug('Reading INPUT.');
    const actorInput = await Apify.getInput();
    if (!actorInput) {
        throw new Error('INPUT cannot be empty!');
    }

    // Turn the input into a crawler instance.
    log.info('Configuring Web Scraper.');
    const crawlerSetup = new CrawlerSetup(actorInput);
    const scraper = await crawlerSetup.createCrawler();

    log.info('Configuration completed. Starting the scrape.');
    await scraper.run();
    log.info('Web Scraper finished.');
});
github apifytech / apify-cli / src / templates / puppeteer_crawler / main.js View on Github external
/**
 * This example demonstrates how to use PuppeteerCrawler in combination with RequestQueue
 * to recursively scrape Hacker News website (https://news.ycombinator.com)
 * using headless Chrome / Puppeteer.
 * The crawler starts with a single URL, finds links to next pages,
 * enqueues them and continues until no more desired links are available.
 * The results are stored to the default dataset. In local configuration,
 * the results are stored as JSON files in `./apify_storage/datasets/default`
 */

const Apify = require('apify');

Apify.main(async () => {
    // Apify.openRequestQueue() is a factory to get a preconfigured RequestQueue instance.
    // We add our first request to it - the initial page the crawler will visit.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: 'https://news.ycombinator.com/' });

    // Create an instance of the PuppeteerCrawler class - a crawler
    // that automatically loads the URLs in headless Chrome / Puppeteer.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Here you can set options that are passed to the Apify.launchPuppeteer() function.
        // For example, you can set "slowMo" to slow down Puppeteer operations to simplify debugging
        launchPuppeteerOptions: { slowMo: 500 },

        // Stop crawling after several pages
        maxRequestsPerCrawl: 10,
github drobnikj / crawler-google-places / src / main.js View on Github external
const Apify = require('apify');
const placesCrawler = require('./places_crawler');
const resultJsonSchema = require('./result_item_schema');
const _ = require('lodash');
const { log } = Apify.utils;

Apify.main(async () => {
    const input = await Apify.getValue('INPUT');
    const { searchString, searchStringsArray, proxyConfig, lat, lng, maxCrawledPlaces, regularTestRun,
        includeReviews = true, includeImages = true, includeHistogram = true, includeOpeningHours = true,
        walker, debug } = input;

    if (debug) log.setLevel(log.LEVELS.DEBUG);
    if (!searchString && !searchStringsArray) throw new Error('Attribute searchString or searchStringsArray is missing in input.');
    if (proxyConfig && proxyConfig.apifyProxyGroups
        && (proxyConfig.apifyProxyGroups.includes('GOOGLESERP') || proxyConfig.apifyProxyGroups.includes('GOOGLE_SERP'))) {
        throw new Error('It is not possible to crawl google places with GOOGLE SERP proxy group. Please use a different one and rerun crawler.');
    }

    log.info('Scraping Google Places for search string:', searchString);

    const startRequests = [];
    let startUrlSearch = 'https://www.google.com/maps/search/';

apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

Apache-2.0
Latest version published 2 months ago

Package Health Score

84 / 100
Full package analysis

Similar packages