How to use scrapelib - 10 common examples

To help you get started, we’ve selected a few scrapelib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openstates / legacy-openstates.org / tweets / latest_tweet.py View on Github external
def main():
    """Fetch the latest @openstates tweet and replace the cached copy in Mongo.

    Drops the existing ``tweets`` collection and inserts the single most
    recent timeline entry, so the collection always holds exactly the
    latest tweet.
    """
    conn = pymongo.Connection(settings.MONGO_HOST, settings.MONGO_PORT)
    tweets = conn['openstates_web']['tweets']
    response = urlopen('http://api.twitter.com/1/statuses/user_timeline.json?screen_name=openstates&count=1&trim_user=1')
    # BUG FIX: urlopen() returns a file-like response object, not the body.
    # json.loads() on the response object raises TypeError; read the body
    # bytes first.
    data = json.loads(response.read())
    tweets.drop()
    # safe=True: wait for the server to acknowledge the write (pymongo 2.x API).
    tweets.insert(data, safe=True)
github openstates / openstates / billy / bin / dump_json.py View on Github external
def dump_json(abbr, filename, validate, schema_dir):
    """Dump scraped data for jurisdiction *abbr* into a ZIP of JSON documents.

    abbr       -- jurisdiction abbreviation, used to look up metadata
    filename   -- path of the ZIP archive to create
    validate   -- presumably toggles schema validation further down -- the
                  use site is outside this view; confirm
    schema_dir -- directory holding the API JSON schemas; when falsy,
                  defaults to ../schemas/api/ relative to this file

    NOTE(review): this function is truncated in this view -- the ZipFile and
    the three loaded schemas are used below the visible lines.
    """
    # Throttled scraper; follow_robots=False skips robots.txt checks.
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    level = metadata(abbr)['level']

    # NOTE: `zip` shadows the builtin; kept as-is (doc-only change).
    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)

    if not schema_dir:
        # Default to the schemas shipped alongside this script.
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)
github openstates / openstates / openstates / ct / __init__.py View on Github external
def get_session_list(self):
        """Return session names found in the CT legislature FTP root listing.

        Each line of the FTP directory listing ends with an entry name; names
        present in SKIP_SESSIONS are filtered out.
        """
        listing = scrapelib.Scraper().get("ftp://ftp.cga.ct.gov").text
        result = []
        for row in listing.splitlines():
            # Last whitespace-separated token of each listing line is the name.
            name = row.split()[-1]
            if name not in SKIP_SESSIONS:
                result.append(name)
        return result
github openstates / openstates / openstates / nm / votes.py View on Github external
def scrape_vote_text(self, filelocation, local=False):
        """Retrieves or uses local copy of vote pdf and converts into XML.

        Returns the XML text, or None when the remote fetch fails.
        """
        if local:
            # Already on disk: convert in place and clean up.
            xml_text = convert_pdf(filelocation, type="xml")
            os.remove(filelocation)
            return xml_text
        try:
            # Download to a temp file, convert, then delete the temp file.
            pdf_path, _response = self.urlretrieve(url=filelocation)
            xml_text = convert_pdf(pdf_path, type="xml")
            os.remove(pdf_path)
        except scrapelib.HTTPError:
            # Best-effort: a failed download just skips this vote.
            self.warning("Request failed: {}".format(filelocation))
            return
        return xml_text
github openstates / openstates / openstates / ne / people.py View on Github external
image=photo_url,
                    primary_org="legislature",
                )
                # NOTE(review): fragment starts mid-call -- the constructor
                # being closed above (presumably a pupa Person) and the
                # enclosing try/loop are outside this view.

                person.add_link(rep_url)
                person.add_source(rep_url)

                # Attach Capitol Office contact details; phone and email are
                # optional and only added when present.
                note = "Capitol Office"
                person.add_contact_detail(type="address", value=address, note=note)
                if phone:
                    person.add_contact_detail(type="voice", value=phone, note=note)
                if email:
                    person.add_contact_detail(type="email", value=email, note=note)

                yield person
            except scrapelib.HTTPError:
                # Best-effort: skip legislators whose page could not be fetched.
                self.warning("could not retrieve %s" % rep_url)
github openstates / openstates / openstates / ut / legislators.py View on Github external
}[leg_info["party"]]
            # NOTE(review): fragment starts mid-expression -- the dict being
            # indexed by party code is above this view.
            photo_url = leg_info["image"]
            leg_id = leg_info["id"]

            if leg_info["house"] == "H":
                # House members: detail page keyed by legislator id.
                leg_url = house_base_url + "detail.jsp?i=" + leg_id
                leg = Legislator(term, 'lower', district, leg_name,
                         party=party, photo_url=photo_url, url=leg_url)
                leg.add_source(leg_url)
                leg = self.scrape_house_member(leg_url, leg)
            else:
                # Senators: page is keyed by district number.
                leg_url = (senate_base_url +
                        "senators/district{dist}.html".format(dist=district))
                try:
                    # HEAD request just to verify the senator page exists.
                    self.head(leg_url)
                except HTTPError:
                    warning_text = "Bad link for {sen}".format(sen=leg_name)
                    self.logger.warning(warning_text)

                    # Page 404'd: create the record without a url/source.
                    leg = Legislator(term, 'upper', district, leg_name,
                         party=party, photo_url=photo_url)
                else:
                    leg = Legislator(term, 'upper', district, leg_name,
                         party=party, photo_url=photo_url,url=leg_url)
                    leg.add_source(leg_url)

                # Optional contact fields -- may be absent from leg_info.
                address = leg_info.get('address', None)
                fax = leg_info.get('fax', None)
                cell = leg_info.get('cell', None)
                home_phone = leg_info.get('homePhone', None)
                work_phone = leg_info.get('workPhone', None)
github openstates / openstates / openstates / oh / bills.py View on Github external
votes = vote_doc.json()
                # NOTE(review): fragment starts mid-function -- this first
                # vote_doc comes from an earlier fetch not visible here.
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                # Build the committee-votes URL from the bill version record.
                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    # Best-effort: a missing vote page skips this version only.
                    self.warning(
                        "Vote page not " "loading; skipping: {}".format(vote_url)
                    )
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                # NOTE(review): truncated below -- the body of this `if` is
                # outside this view.
                if data["items"][0]["effective_date"]:
github openstates / openstates / openstates / ks / committees.py View on Github external
# set to joint if we are using the special_committees
                # NOTE(review): fragment starts mid-loop -- com_type, chamber
                # and committee_data come from the enclosing iteration, which
                # is outside this view.
                com_chamber = (
                    "legislature" if com_type == "special_committees" else chamber
                )

                committee = Organization(
                    committee_data["TITLE"],
                    chamber=com_chamber,
                    classification="committee",
                )

                # Fetch the committee detail JSON from the KS legislature API.
                com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    # Best-effort: skip committees whose detail page is down.
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)["content"]
                # Chairs occasionally lack FULLNAME; fall back to deriving a
                # name from the KPID (via parse_kpid -- defined elsewhere).
                for chair in details["CHAIR"]:
                    if chair.get("FULLNAME", None):
                        chair_name = chair["FULLNAME"]
                    else:
                        chair_name = self.parse_kpid(chair["KPID"])
                        self.warning("no FULLNAME for %s", chair["KPID"])
                    committee.add_member(chair_name, "chairman")
                for vicechair in details["VICECHAIR"]:
                    committee.add_member(vicechair["FULLNAME"], "vice-chairman")
                for rankedmember in details["RMMEM"]:
                    committee.add_member(rankedmember["FULLNAME"], "ranking member")
                for member in details["MEMBERS"]:
                    committee.add_member(member["FULLNAME"])
github openstates / openstates / openstates / mn / committees.py View on Github external
html = self.get(url).text
        # NOTE(review): fragment starts mid-method -- `url` is defined above
        # this view; the loop below is also truncated at the bottom.
        doc = lxml.html.fromstring(html)

        for com in doc.xpath('//h2[@class="commhighlight"]'):
            # "Members" link lives in the first paragraph after the heading.
            members_url = com.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href'
            )[0]

            com = Organization(com.text, chamber="lower", classification="committee")
            com.add_source(members_url)

            try:
                member_html = self.get(members_url).text
                mdoc = lxml.html.fromstring(member_html)
            except HTTPError:
                # Best-effort: a dead member list skips just this committee.
                self.warning(
                    "Member list for {} failed to respond; skipping".format(com.name)
                )
                continue

            # each legislator in their own table
            # first row, second column contains all the info
            for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):

                # name is tail string of last element
                name = ltable.text_content()
                text = ltable.text
                # Strip the leading <b> text (presumably a role prefix) so
                # only the member name remains -- confirm against live pages.
                if text and name != text:
                    name = name.replace(text, "")

                # role is inside a nested b tag
github openstates / openstates / openstates / ut / bills.py View on Github external
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        """Parse an HTML vote page for *bill* (generator).

        Committee votes are delegated to scrape_committee_vote; voice votes
        and unfetchable pages yield nothing.

        NOTE(review): truncated in this view -- whatever follows the
        committee branch (presumably floor-vote parsing) is not visible.
        """
        try:
            page = self.get(url).text
        except scrapelib.HTTPError:
            # Best-effort: a missing vote page just skips this vote.
            self.warning("A vote page not found for bill {}".format(bill.identifier))
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # Vote description is in the first <b>; newer pages put it in the
        # first <center> instead.
        descr = page.xpath("//b")[0].text_content()
        if descr == "":
            # New page method
            descr = page.xpath("//center")[0].text

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            yield from self.scrape_committee_vote(
                bill, actor, date, motion, page, url, uniqid
            )