How to use the scrapelib.HTTPError exception in scrapelib

To help you get started, we’ve selected a few scrapelib.HTTPError examples based on popular ways the library is used in public projects.
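
All of the snippets below share one core pattern: scrapelib raises scrapelib.HTTPError when a request ultimately comes back with an error status, and scrapers catch it, log a warning, and move on instead of crashing. Here is a minimal, self-contained sketch of that pattern (the URL is a placeholder and the retry setting is illustrative):

import scrapelib

# scrapelib raises scrapelib.HTTPError for responses with error status
# codes; retry_attempts controls how many times failures are retried.
scraper = scrapelib.Scraper(retry_attempts=2)

try:
    response = scraper.get("https://example.com/might-404")  # placeholder URL
except scrapelib.HTTPError as exc:
    # The request failed even after retries; warn and carry on.
    print("request failed: {}".format(exc))
else:
    print(response.text[:100])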


github openstates/openstates/openstates/nm/votes.py (View on Github)
def scrape_vote_text(self, filelocation, local=False):
        """Retrieves or uses local copy of vote pdf and converts into XML."""
        if not local:
            try:
                filename, response = self.urlretrieve(url=filelocation)
                vote_text = convert_pdf(filename, type="xml")
                os.remove(filename)
            except scrapelib.HTTPError:
                self.warning("Request failed: {}".format(filelocation))
                return
        else:
            vote_text = convert_pdf(filelocation, type="xml")
            os.remove(filelocation)
        return vote_text
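
A note on the snippet above: urlretrieve saves the response body to a temporary file and returns a (filename, response) pair, which is why the code deletes the file once the PDF has been converted. A hedged sketch of the same download-convert-cleanup flow, with a stand-in for the project's convert_pdf helper and a placeholder URL:

import os
import scrapelib

scraper = scrapelib.Scraper()

try:
    # urlretrieve downloads to a temporary file and returns (filename, response)
    filename, response = scraper.urlretrieve("https://example.com/vote.pdf")
except scrapelib.HTTPError:
    print("request failed")
else:
    try:
        pdf_bytes = open(filename, "rb").read()  # the real code calls convert_pdf(filename, type="xml") here
    finally:
        os.remove(filename)  # always clean up the temporary file
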
github openstates/openstates/openstates/ne/people.py (View on Github)
                    image=photo_url,
                    primary_org="legislature",
                )

                person.add_link(rep_url)
                person.add_source(rep_url)

                note = "Capitol Office"
                person.add_contact_detail(type="address", value=address, note=note)
                if phone:
                    person.add_contact_detail(type="voice", value=phone, note=note)
                if email:
                    person.add_contact_detail(type="email", value=email, note=note)

                yield person
            except scrapelib.HTTPError:
                self.warning("could not retrieve %s" % rep_url)
github openstates/openstates/openstates/ut/legislators.py (View on Github)
            }[leg_info["party"]]
            photo_url = leg_info["image"]
            leg_id = leg_info["id"]
            
            if leg_info["house"] == "H":
                leg_url = house_base_url + "detail.jsp?i=" + leg_id
                leg = Legislator(term, 'lower', district, leg_name,
                                 party=party, photo_url=photo_url, url=leg_url)
                leg.add_source(leg_url)
                leg = self.scrape_house_member(leg_url, leg)
            else:
                leg_url = (senate_base_url +
                           "senators/district{dist}.html".format(dist=district))
                try:
                    self.head(leg_url)
                except HTTPError:
                    warning_text = "Bad link for {sen}".format(sen=leg_name)
                    self.logger.warning(warning_text)

                    leg = Legislator(term, 'upper', district, leg_name,
                                     party=party, photo_url=photo_url)
                else:
                    leg = Legislator(term, 'upper', district, leg_name,
                                     party=party, photo_url=photo_url, url=leg_url)
                    leg.add_source(leg_url)

                address = leg_info.get('address', None)
                fax = leg_info.get('fax', None)
                cell = leg_info.get('cell', None)
                home_phone = leg_info.get('homePhone', None)
                work_phone = leg_info.get('workPhone', None)
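
The self.head(leg_url) call in the snippet above is a cheap way to validate a link before using it: a HEAD request downloads no body, but scrapelib still raises HTTPError on a bad status. A minimal sketch of that probe, with a placeholder URL:

import scrapelib

scraper = scrapelib.Scraper()
leg_url = "https://example.com/senators/district1.html"  # placeholder

try:
    scraper.head(leg_url)  # HEAD request: checks the status without downloading the body
except scrapelib.HTTPError:
    leg_url = None  # bad link; build the record without a URL, as the snippet does
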
github openstates/openstates/openstates/oh/bills.py (View on Github)
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not " "loading; skipping: {}".format(vote_url)
                    )
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                if data["items"][0]["effective_date"]:
github openstates/openstates/openstates/ks/committees.py (View on Github)
                # set to joint if we are using the special_committees
                com_chamber = (
                    "legislature" if com_type == "special_committees" else chamber
                )

                committee = Organization(
                    committee_data["TITLE"],
                    chamber=com_chamber,
                    classification="committee",
                )

                com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)["content"]
                for chair in details["CHAIR"]:
                    if chair.get("FULLNAME", None):
                        chair_name = chair["FULLNAME"]
                    else:
                        chair_name = self.parse_kpid(chair["KPID"])
                        self.warning("no FULLNAME for %s", chair["KPID"])
                    committee.add_member(chair_name, "chairman")
                for vicechair in details["VICECHAIR"]:
                    committee.add_member(vicechair["FULLNAME"], "vice-chairman")
                for rankedmember in details["RMMEM"]:
                    committee.add_member(rankedmember["FULLNAME"], "ranking member")
                for member in details["MEMBERS"]:
                    committee.add_member(member["FULLNAME"])
github openstates/openstates/openstates/mn/committees.py (View on Github)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        for com in doc.xpath('//h2[@class="commhighlight"]'):
            members_url = com.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href'
            )[0]

            com = Organization(com.text, chamber="lower", classification="committee")
            com.add_source(members_url)

            try:
                member_html = self.get(members_url).text
                mdoc = lxml.html.fromstring(member_html)
            except HTTPError:
                self.warning(
                    "Member list for {} failed to respond; skipping".format(com.name)
                )
                continue

            # each legislator in their own table
            # first row, second column contains all the info
            for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):

                # name is tail string of last element
                name = ltable.text_content()
                text = ltable.text
                if text and name != text:
                    name = name.replace(text, "")

                # role is inside a nested b tag
github openstates/openstates/openstates/ut/bills.py (View on Github)
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
        try:
            page = self.get(url).text
        except scrapelib.HTTPError:
            self.warning("A vote page not found for bill {}".format(bill.identifier))
            return
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        descr = page.xpath("//b")[0].text_content()
        if descr == "":
            # New page method
            descr = page.xpath("//center")[0].text

        if "on voice vote" in descr:
            return

        if "committee" in descr.lower():
            yield from self.scrape_committee_vote(
                bill, actor, date, motion, page, url, uniqid
            )
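
The get-then-parse sequence at the top of parse_html_vote (fetch the page, parse it with lxml, make its links absolute) is a pattern several of these scrapers wrap in a small helper; the events example further down calls one named lxmlize. A hedged sketch of such a helper, letting scrapelib.HTTPError propagate to the caller:

import lxml.html
import scrapelib

scraper = scrapelib.Scraper()

def lxmlize(url):
    # Fetch and parse a page; scrapelib.HTTPError propagates so the
    # caller can decide whether to warn-and-skip or abort.
    response = scraper.get(url)
    doc = lxml.html.fromstring(response.text)
    doc.make_links_absolute(url)
    return doc
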
github openstates/openstates/openstates/ma/votes.py (View on Github)
def scrape_vote(self, session, rollcall_number):

        # Fetch this piece of garbage.
        url = (
            'http://www.mass.gov/legis/journal/RollCallPdfs/'
            '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
        url_args = dict(
            session=re.findall(r'\d+', session).pop(),
            rollcall=str(rollcall_number).zfill(5))
        url = url.format(**url_args)

        try:
            vote_file, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            # We'll hit a 404 at the end of the votes.
            self.warning('Stopping; encountered a 404 at %s' % url)
            raise self.EndOfHouseVotes

        text = convert_pdf(vote_file, type='text')
        text = text.decode('utf8')

        # A hack to guess whether this PDF has embedded images or contains
        # machine readable text.
        if len(re.findall(r'[YNPX]', text)) > 157:
            vote = self.house_get_vote(text, vote_file, session)
        else:
            vote = self.house_get_vote_with_images(text, vote_file, session)
            self.house_add_votes_from_image(vote_file, vote)

        vote.add_source(url)
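
The custom EndOfHouseVotes exception above turns a 404 into a loop terminator: roll calls are numbered sequentially, so the scraper keeps requesting the next PDF until scrapelib.HTTPError signals it has run past the last one. A hedged sketch of that probe-until-404 pattern, with an illustrative URL template:

import scrapelib

scraper = scrapelib.Scraper()

rollcall = 1
vote_files = []
while True:
    url = "https://example.com/rollcalls/{:05d}.pdf".format(rollcall)  # placeholder template
    try:
        filename, _response = scraper.urlretrieve(url)
    except scrapelib.HTTPError:
        break  # roll calls are sequential, so the first 404 means we are done
    vote_files.append(filename)
    rollcall += 1
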
github opencivicdata/python-legistar-scraper/legistar/base.py (View on Github)
            page = lxml.html.fromstring(response.text)
            returned_range, = page.xpath(
                "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']")

            returned_range = returned_range.value

            if returned_range != expected_range:
                response.status_code = 520
                # In the event of a retry, the new request does not
                # contain the correct payload data.  This comes as a
                # result of not updating the payload via sessionSecrets:
                # so, we do that here.
                payload.update(self.sessionSecrets(page))

                raise scrapelib.HTTPError(response)
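
Raising scrapelib.HTTPError yourself, as this legistar snippet does, lets a response that is technically 200 but semantically wrong (here, a search result for the wrong year range) flow through the same failure handling as a real HTTP error. A hedged sketch of the idea, with a simplified validity check and a hypothetical expected_marker parameter:

import scrapelib

scraper = scrapelib.Scraper()

def get_validated(url, expected_marker, attempts=3):
    """Fetch url, retrying while the body lacks the content we asked for."""
    last_error = None
    for _ in range(attempts):
        response = scraper.get(url)
        if expected_marker in response.text:
            return response
        # 200 with the wrong content: relabel it as a server error so the
        # failure looks like any other HTTP error to callers.
        response.status_code = 520
        last_error = scrapelib.HTTPError(response)
    raise last_error
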
github openstates/openstates/openstates/ut/events.py (View on Github)
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(
                name=title,
                start_date=self._tz.localize(
                    datetime.datetime.strptime(when, "%b %d, %Y")
                ),
                location_name="State Capitol",
            )
            event.add_source(URL)

            url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()'
                )[0].strip()
                event.add_participant(committee_name, type="committee", note="host")

            documents = doc.xpath(".//td")
            for document in documents:
                url = re.search(r"(http://.*?pdf)", document.xpath("@onclick")[0])
                if url is None: