How to use the apify.Request function in apify

To help you get started, we’ve selected a few apify examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github cermak-petr / actor-booking-scraper / src / main.js View on Github external
console.log('enqueuing pagination pages...');
                        const pageSelector = '.bui-pagination__list a:not([aria-current])';
                        const countSelector = '.sorth1, .sr_header h1, .sr_header h2';
                        try{
                            await page.waitForSelector(pageSelector, {timeout: 60000});
                            const pageElem = await page.$(pageSelector);
                            const pageUrl = await getAttribute(pageElem, 'href');
                            await page.waitForSelector(countSelector);
                            const countElem = await page.$(countSelector);
                            const countData = (await getAttribute(countElem, 'textContent')).replace(/\.|,|\s/g, '').match(/\d+/);
                            if(countData){
                                const count = Math.ceil(parseInt(countData[0])/20);
                                console.log('pagination pages: ' + count);
                                for(let i = 0; i < count; i++){
                                    const newUrl = pageUrl.replace(/rows=(\d+)/, 'rows=20').replace(/offset=(\d+)/, 'offset=' + 20*i);
                                    await requestQueue.addRequest(new Apify.Request({
                                        url: addUrlParameters(newUrl, input),
                                        //url: baseUrl + '&rows=20&offset=' + 20*i, 
                                        userData: {label: 'page'}
                                    }));
                                }
                            }
                        }
                        catch(e){
                            console.log(e); 
                            await Apify.setValue('count_error.html', await page.content(), {contentType: 'text/html'});
                        }
                    }
                }
                
                // If property type is enabled, enqueue necessary page.
                if(settingPropertyType){
github apifytech / apify-js / examples / crawler_puppeteer.js View on Github external
Apify.main(async () => {
    // Get queue and enqueue first url.
    const requestQueue = await Apify.openRequestQueue();

    // Enqueue Start url.
    await requestQueue.addRequest(new Apify.Request({ url: 'https://news.ycombinator.com/' }));

    // Create crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // This function is executed for each request.
        // If a request fails, it is retried 3 times.
        // The `page` parameter is Puppeteer's page object with the page already loaded.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            // Extract all posts.
            const pageFunction = ($posts) => {
                const data = [];

                $posts.forEach(($post) => {
github apifytech / apify-js / examples / crawler_cheerio.js View on Github external
title: $(el).find('.title a').text(),
                    rank: $(el).find('.rank').text(),
                    href: $(el).find('.title a').attr('href'),
                });
            });

            // Save data.
            await Apify.pushData(data);

            // Enqueue next page.
            const $moreLink = $('.morelink');
            if ($moreLink.length) {
                const path = $moreLink.attr('href')
                const url = `https://news.ycombinator.com/${path}`;

                await requestQueue.addRequest(new Apify.Request({ url }));
            } else {
                console.log(`Url ${request.url} is the last page!`);
            }
        },
github apifytech / actor-scraper / test / tools.js View on Github external
it('should work', async () => {
            // Load PAGE_CONTENT into a fresh browser page.
            const page = await browser.newPage();
            await page.setContent(PAGE_CONTENT);

            // Swap the queue's addRequest for a stub that records every request it
            // receives and answers with sequential ids: some-1, some-2, ...
            const requestQueue = await Apify.openRequestQueue();
            let issuedIds = 0;
            requestQueue.requests = [];
            requestQueue.addRequest = (enqueued) => {
                requestQueue.requests.push(enqueued);
                issuedIds += 1;
                return { requestId: `some-${issuedIds}` };
            };

            // The parent request gets its metadata initialised before links are enqueued.
            const parent = new Apify.Request({ id: 'parent', url: 'https://www.example.com' });
            tools.ensureMetaData(parent);

            await tools.enqueueLinks(
                page,
                'a',
                [{ purl: 'https://example.com[.*]' }],
                requestQueue,
                parent,
            );

            // Every recorded child request must point back to the parent at depth 1.
            expect(requestQueue.requests).to.have.lengthOf(3);
            for (const { userData } of requestQueue.requests) {
                const meta = userData[META_KEY];
                expect(meta.depth).to.be.eql(1);
                expect(meta.parentRequestId).to.be.eql('parent');
                expect(meta.childRequestIds).to.be.eql({});
            }

            // The parent must list the ids handed out by the stubbed addRequest.
            const childIds = Object.keys(parent.userData[META_KEY].childRequestIds);
            expect(childIds).to.have.lengthOf(3);
            for (const childId of childIds) {
                expect(/^some-[123]$/.test(childId)).to.be.eql(true);
            }
        });
    });
github apifytech / actor-scraper / test / tools.js View on Github external
it('should work', () => {
            // A request created with only a URL, before any metadata is attached.
            const freshRequest = new Apify.Request({ url: 'https://www.example.com' });
            tools.ensureMetaData(freshRequest);

            // ensureMetaData must have stored an object under META_KEY ...
            const { [META_KEY]: meta } = freshRequest.userData;
            expect(meta).to.be.an('object');

            // ... holding the values expected for a request without a parent.
            const { depth, parentRequestId, childRequestIds } = meta;
            expect(depth).to.be.eql(0);
            expect(parentRequestId).to.be.eql(null);
            expect(childRequestIds).to.be.eql({});
        });
    });
github cermak-petr / actor-booking-scraper / src / util.js View on Github external
/**
 * Enqueues the page reached through the price filter chosen in `input.minMaxPrice`.
 *
 * The filter is looked up by the position of `input.minMaxPrice` in `pLabels`; the
 * element at the same position among the `.filterelement` nodes of the first
 * `.filteroptions` box supplies both the label text and the `href` to enqueue.
 *
 * @param {Object} page - Puppeteer page to read the filter elements from.
 * @param {Object} input - Actor input; `input.minMaxPrice` selects the filter.
 * @param {Object} requestQueue - Queue that receives the new request.
 * @returns {Promise<void>}
 * @throws {Error} When the filter box is missing or the requested price filter
 *     cannot be matched to an element on the page.
 */
module.exports.setMinMaxPrice = async (page, input, requestQueue) => {
    console.log('enqueuing min-max price page...');
    const urlMod = fixUrl('&', input);

    // page.$$ resolves to an empty array when nothing matches, so the first
    // entry may be undefined; fail with a clear message instead of a TypeError.
    const [filterBox] = await page.$$('.filteroptions');
    if (!filterBox) {
        throw new Error('Price filter box (.filteroptions) was not found on the page.');
    }
    const fPrices = await filterBox.$$('.filterelement');

    // indexOf yields -1 for an unknown label, which would otherwise index
    // fPrices[-1] (undefined) and crash on the next line.
    const index = pLabels.indexOf(input.minMaxPrice);
    if (index === -1 || index >= fPrices.length) {
        throw new Error(`Price filter "${input.minMaxPrice}" is not available on the page.`);
    }

    const filterElement = fPrices[index];
    const label = await filterElement.$('.filter_label');
    const fText = await getAttribute(label, 'textContent');
    console.log(`Using filter: ${fText}`);
    const href = await getAttribute(filterElement, 'href');

    await requestQueue.addRequest(new Apify.Request({
        userData: { label: 'page' },
        url: urlMod(href),
        // Key built from the label text rather than the URL.
        uniqueKey: `${fText}_0`,
    }));
};

apify

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.

Apache-2.0
Latest version published 17 days ago

Package Health Score

84 / 100
Full package analysis