How to use the scrapelib.Scraper class in scrapelib

To help you get started, we’ve selected a few scrapelib examples based on popular ways it is used in public projects.

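Before the project examples, here is a minimal sketch of the usual pattern: build a Scraper with the throttling and retry behavior you need, then call get() just as you would on a requests session. The URL below is only a placeholder, and the follow_robots argument that appears in several snippets below is specific to older scrapelib releases.

import scrapelib

# throttle to 60 requests per minute and retry failed requests up to 3 times
scraper = scrapelib.Scraper(requests_per_minute=60, retry_attempts=3)

# Scraper subclasses requests.Session, so get() returns a familiar Response object
response = scraper.get("https://example.com")
print(response.status_code, len(response.text))
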
github openstates / openstates / billy / bin / dump_json.py
def dump_json(abbr, filename, validate, schema_dir):
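    # one Scraper for the whole dump: 600 requests/minute, robots.txt checks disabled (older scrapelib API)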
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    level = metadata(abbr)['level']

    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

github openstates / openstates / openstates / ct / __init__.py
def get_session_list(self):
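        # scrapelib can fetch ftp:// URLs; the last field of each FTP listing line is taken as a session name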
        text = scrapelib.Scraper().get("ftp://ftp.cga.ct.gov").text
        sessions = [line.split()[-1] for line in text.splitlines()]
        return [session for session in sessions if session not in SKIP_SESSIONS]

github openstates / openstates / openstates / nd / __init__.py
def get_session_list(self):
        import scrapelib
        import lxml.html

        url = "http://www.legis.nd.gov/assembly/"
        html = scrapelib.Scraper().get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
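        # session names are the link texts inside the view-content block; Territorial Assembly entries are dropped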
        sessions = doc.xpath("//div[@class='view-content']//a/text()")
        sessions = [
            session for session in sessions if "Territorial Assembly" not in session
        ]
        return sessions

github openstates / openstates / scrapers / va / __init__.py
def get_session_list(self):
        scraper = scrapelib.Scraper(requests_per_minute=40)
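        # url_xpath and get_session_id are project helpers; requester=scraper routes their requests through the throttled scraper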
        vals = url_xpath(
            'http://lis.virginia.gov',
            '//div[@id = "sLink"]//option[@value != "01"]/@value',
            requester=scraper,
        )
        sessions = [get_session_id(val, scraper) for val in vals]
        return [session for session in sessions if session is not None]

github unitedstates / petitions / scripts / petitions.py
#!/usr/bin/env python
from StringIO import StringIO
import argparse
import json
from datetime import datetime
import scrapelib
from lxml.html import etree
from utils import log, download, write, log_dir

#initialize scraper and parser
s = scrapelib.Scraper(requests_per_minute=60, follow_robots=False)
parser = etree.HTMLParser()

scrapelog = {
    "begin" : datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
    "signatures": {}
}

def petitions(start=1, mx=None):
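    # mx caps how many petitions to collect; None is normalized to -1 (no explicit cap)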
    if mx is None:
        mx = -1
    
    #log objects for tracking signatures over time
    hits = 0
    
    #scan WH site, add any new petitions to DB
    #surely a better way to get indefinite number of results than to create a functionally infinite loop, then breaking it, but drawing a blank

github unitedstates / petitions / scripts / utils.py
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    text = re.sub("&#?\w+;", fixup, text)
    text = remove_unicode_control(text)
    return text


##### Downloading
#the os.getcwd() (current working directory) is for file system access in Cloud9IDE


import scrapelib
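# shared module-level scraper: 120 requests/minute, robots.txt ignored, up to 3 retries per request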
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False, retry_attempts=3)


# uses config values if present
def cache_dir():
    return os.getcwd() + "/cache"

# uses config values if present
def data_dir():
    return os.getcwd() + "/data"

# uses config values if present
def log_dir():
    return os.getcwd() + "/log"

def download(url, destination, force=False, options=None):
    if not options:

github opencivicdata / python-legistar-scraper / scripts / guessdomains.py
import re
import os
import csv
import time
import logging
import logging.config
from os.path import join

import scrapelib

path = '/home/thom/sunlight/python-opencivicdata/opencivicdata/division-ids/identifiers/country-us'


class Checker(scrapelib.Scraper):

    OUTFILE = 'domains.csv'
    SCRAPELIB_RPM = 10
    SCRAPELIB_TIMEOUT = 60
    SCRAPELIB_RETRY_ATTEMPTS = 0
    SCRAPELIB_RETRY_WAIT_SECONDS = 20
    FASTMODE = True
    # PROXIES = dict(http="http://localhost", https='https://localhost')
    BOGUS_DOMAIN_MESSAGE = 'Invalid parameters!!'

    def __init__(self):
        super().__init__()
        self.checked_places = set()
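        # remember which places have already been checked so domains are not probed twice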
        logging.config.dictConfig(self.LOGGING_CONFIG)
        self.logger = logging.getLogger('legistar')