How to use crawler - 8 common examples

To help you get started, we’ve selected a few crawler examples, based on popular ways it is used in public projects.

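All of these examples share one basic pattern: construct a Crawler from an options object with a per-page callback, then queue URLs onto it. A minimal sketch of that pattern, matching the older API used in these excerpts (newer releases of the package export the constructor directly and pass the callback a done function), looks like this:

var Crawler = require("crawler").Crawler;

var c = new Crawler({
    maxConnections: 10, // cap the number of concurrent requests
    callback: function (error, result, $) {
        if (error) {
            return console.error(error);
        }
        // $ is a jQuery-like instance scoped to the fetched page
        console.log(result.request.href, $("title").text());
    }
});

c.queue("http://example.com/");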

github atd-schubert / node-webcheck / lib / crawler.js
opts.userAgent = opts.userAgent || "node-webcheck";
  opts.retries = opts.retries || 3;
  opts.retryTimeout = opts.retryTimeout || 10000;
  opts.skipForeigners = opts.skipForeigners || false; // completely skip URLs on domains other than the baseURL
  opts.followForeigners = opts.followForeigners || false; // check links to foreign domains, but don't follow links on those pages
  
  var emit = opts.eventEmitter = opts.eventEmitter || function(){}; // optional event-emitter hook; defaults to a no-op
  
  if (opts.forceUTF8 === undefined) opts.forceUTF8 = true;
  // at this moment duplicates are all skipped... // if (opts.skipDuplicates === undefined) opts.skipDuplicates = true;
  
  var res = {};
  
  emit("base", url);
  
  var c = new Crawler({
    "maxConnections": opts.maxConnections,
    "timeout": opts.timeout,
    "userAgent": opts.userAgent,
    "retries": opts.retries,
    "retryTimeout": opts.retryTimeout,
    "forceUTF8": opts.forceUTF8,
    "callback": function(error,result,$) {
      if (error) {
        return emit("crawlingError", error); //res[result.window.location.href] = {};
      }
      try {
        var po = res[result.request.href] = {};

        po.url = result.request.href;
        po.status = result.statusCode;
        po.headers = result.headers;
github christianvuerings / berkeleydir / legacy / server-crawler.js
var Crawler = require("crawler").Crawler;
var Firebase = require('firebase');
var firebaseDB = new Firebase('https://berkeleydir.firebaseio.com/users');

var url = 'https://calnet.berkeley.edu/directory/details.pl?uid=';
var urls = [];
var numbers = [];
for(var i = 514101; i < 1200000; i++) {
    urls.push(url + i);
}

var c = new Crawler({
    "maxConnections": 10,

    // This will be called for each crawled page
    "callback": function(error, result ,$) {
        // $ is a jQuery instance scoped to the server-side DOM of the page
        var name = $('#content > p span:nth-child(2)').html();
        if (name) {
            var id = result.window.location._url.query.replace('uid=', '');
            var email = $('#content span:contains("Email:")').next().text();
            var person = {
                id: parseInt(id, 10),
                name: name,
                email: email
            };
            firebaseDB.child(id).set(person);
            console.log(person);
        }
    }
});
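The excerpt stops before the crawl is started; presumably the script then hands the generated URLs to the crawler's queue:

c.queue(urls); // enqueue all generated directory URLs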
github ahkimkoo / neocrawler / proxyCollector / proxyCrawler.js
var proxyCrawler = function(settings){
	this.settings = settings;

	crawler = new Crawler({
	      maxConnections: 1,
	      timeout: (1000 * 20)
	    });

	console.log("start proxy crawler.");
}
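Here maxConnections: 1 serializes the requests, a simple way to avoid hammering the proxy-list sites being scraped, and the 20-second timeout leaves room for slow responses.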
github adblockplus / abpcrawler / lib / main.js
applicationReady.then(function()
  {
    let window = Services.wm.getMostRecentWindow("navigator:browser");
    run(window, urls, timeout, maxtabs, baseURL + "save", function()
    {
      Services.startup.quit(Services.startup.eAttemptQuit);
    });
  }, function(exception)
  {
github adblockplus / abpcrawler / lib / application.js
Application_Session.prototype.run = function( finisher, catcher )
{
  this.finisher = finisher;
  this.catcher = catcher;
  if ( !this.runnable )
  {
    this._run_catch( new Error( "Application_Session is not runnable" ) );
    return;
  }
  this.runnable = false;

  this.current_crawler = new Crawler(
    this.instructions, this.outputs, this.window,
    this.time_limit, this.leave_open, this.n_tabs
  );


  if ( this.progress )
  {
    /*
     * Add an instance-specific notice member to the crawler's progress instance. This is cleaner than
     * bothering with a subclass of the progress-notification class.
     */
    this.current_crawler.progress.notice = function( notice )
    {
      notice( this );
    }.bind( this.current_crawler.progress, this.progress );
  }
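The bind call pins the crawler's progress object as this and the session's own progress callback as the notice argument, so each progress update forwards the crawler's progress instance to the session's callback.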
github ltebean / spiderman / lib / pageProcessor.js
function PageProcessor(name,config,performSegue){
	this.name=name;
	this.type=config.type;
	this.performSegue=performSegue;
	//init segues

	this.segues={};
	for(var i=0;i<
github infinitbyte / gopa / ui / page / index.html
$.get('/stats').done(function (data) {
                if(data["queue.check"]){
                    $("#checker_task_num").text(data["queue.check"].pop+" / "+data["queue.check"].push+", valid: "+data["checker.url"].valid_seed);
                    option.series[0].data[0].value = ((data["queue.check"].pop/data["queue.check"].push)*100).toFixed(2) - 0;
                }

                if(data["crawler.pipeline"]){
                    $("#crawler_task_num").text(safeGetValue(data["crawler.pipeline"].finished)+" / "+data["crawler.pipeline"].total+", error: "+safeGetValue(data["crawler.pipeline"].error)+", break: "+safeGetValue(data["crawler.pipeline"].break)+", queue: "+safeGetValue(data["queue.fetch"].pop)+" / "+data["queue.fetch"].push);
                    option.series[1].data[0].value = (((parseInt(safeGetValue(data["queue.fetch"].pop)))/parseInt(safeGetValue(data["queue.fetch"].push)))*100).toFixed(2) - 0;
                }

                myChart.setOption(option, true);
            });
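Note the "- 0" after toFixed(2): toFixed returns a string, and subtracting zero coerces the formatted value back to a number before it is stored in the chart option.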
github infinitbyte / gopa / static / assets / js / page / index.js
$.get('/stats').done(function (data) {
        if(data["queue.check"]){
            $("#checker_task_num").text(data["queue.check"].pop+" / "+data["queue.check"].push+", valid: "+data["checker.url"].valid_seed);
        }

        if(data["crawler.pipeline"]){
            $("#crawler_task_num").text(safeGetValue(data["crawler.pipeline"].finished)+" / "+data["crawler.pipeline"].total+", error: "+safeGetValue(data["crawler.pipeline"].error)+", break: "+safeGetValue(data["crawler.pipeline"].break)+", queue: "+safeGetValue(data["queue.fetch"].pop)+" / "+data["queue.fetch"].push);
        }
    });
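The gopa snippets rely on a safeGetValue helper that is not included in the excerpts. A plausible definition, hypothetical but consistent with how the helper is used to guard missing counters, would be:

// Hypothetical helper (not shown in the excerpts): fall back to 0 for missing stats fields
function safeGetValue(value) {
    return (value === undefined || value === null) ? 0 : value;
}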

crawler

Crawler is a web spider written in Node.js. It gives you the full power of jQuery on the server to parse large numbers of pages as they are downloaded, asynchronously.
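The package is published on npm:

npm install crawler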

License: MIT
Latest version published 4 months ago
Package Health Score: 65 / 100