# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def scrape_vote_text(self, filelocation, local=False):
    """Retrieve (or reuse a local copy of) a vote PDF and convert it to XML.

    :param filelocation: URL of the vote PDF, or a path on disk when
        ``local`` is True.
    :param local: when True, skip the download and convert ``filelocation``
        directly; the file is deleted after conversion in both modes.
    :returns: XML text produced by ``convert_pdf``, or ``None`` when the
        remote request fails with an HTTP error.
    """
    if not local:
        # Keep the try body minimal: only the download can raise HTTPError.
        try:
            filename, response = self.urlretrieve(url=filelocation)
        except scrapelib.HTTPError:
            self.warning("Request failed: {}".format(filelocation))
            return
        try:
            vote_text = convert_pdf(filename, type="xml")
        finally:
            # Always remove the downloaded temp file, even if the PDF
            # conversion raises.
            os.remove(filename)
    else:
        vote_text = convert_pdf(filelocation, type="xml")
        os.remove(filelocation)
    return vote_text
# NOTE(review): orphaned fragment — the enclosing function header, the start
# of the constructor call closed on the first lines, and the `try:` matching
# the `except` below are not visible in this chunk.
# Tail of a person scraper: finishes building the person, attaches link,
# source, and contact details, then yields it.
image=photo_url,
primary_org="legislature",
)
person.add_link(rep_url)
person.add_source(rep_url)
note = "Capitol Office"
person.add_contact_detail(type="address", value=address, note=note)
# Phone and email are optional; only record them when present.
if phone:
person.add_contact_detail(type="voice", value=phone, note=note)
if email:
person.add_contact_detail(type="email", value=email, note=note)
yield person
except scrapelib.HTTPError:
self.warning("could not retrieve %s" % rep_url)
# NOTE(review): orphaned fragment — the dict literal subscripted on the first
# line and the enclosing loop/function header are not visible in this chunk.
# Builds a Legislator for either chamber from scraped `leg_info` data.
}[leg_info["party"]]
photo_url = leg_info["image"]
leg_id = leg_info["id"]
if leg_info["house"] == "H":
leg_url = house_base_url + "detail.jsp?i=" + leg_id
leg = Legislator(term, 'lower', district, leg_name,
party=party, photo_url=photo_url, url=leg_url)
leg.add_source(leg_url)
leg = self.scrape_house_member(leg_url, leg)
else:
leg_url = (senate_base_url +
"senators/district{dist}.html".format(dist=district))
try:
# HEAD request only validates that the senator's page exists.
self.head(leg_url)
except HTTPError:
warning_text = "Bad link for {sen}".format(sen=leg_name)
self.logger.warning(warning_text)
# Broken link: build the legislator without a url.
leg = Legislator(term, 'upper', district, leg_name,
party=party, photo_url=photo_url)
else:
leg = Legislator(term, 'upper', district, leg_name,
party=party, photo_url=photo_url,url=leg_url)
leg.add_source(leg_url)
# Optional contact fields; each defaults to None when absent.
address = leg_info.get('address', None)
fax = leg_info.get('fax', None)
cell = leg_info.get('cell', None)
home_phone = leg_info.get('homePhone', None)
work_phone = leg_info.get('workPhone', None)
# NOTE(review): orphaned fragment — the enclosing per-bill loop and the
# definitions of vote_doc/vote_url/base_url/bill etc. are not visible here.
# Processes a previously fetched vote document, then fetches and processes
# the version's committee votes.
votes = vote_doc.json()
yield from self.process_vote(
votes,
vote_url,
base_url,
bill,
legislators,
chamber_dict,
vote_results,
)
# Committee votes: build the URL from the version's cmtevotes link.
vote_url = base_url
vote_url += bill_version["cmtevotes"][0]["link"]
try:
vote_doc = self.get(vote_url)
except scrapelib.HTTPError:
self.warning(
"Vote page not " "loading; skipping: {}".format(vote_url)
)
continue
votes = vote_doc.json()
yield from self.process_vote(
votes,
vote_url,
base_url,
bill,
legislators,
chamber_dict,
vote_results,
)
# NOTE(review): orphaned fragment — appears to splice two different contexts:
# an `effective_date` check, then ksapi committee scraping. The enclosing
# loop/function headers are not visible in this chunk.
if data["items"][0]["effective_date"]:
# set to joint if we are using the special_committees
com_chamber = (
"legislature" if com_type == "special_committees" else chamber
)
committee = Organization(
committee_data["TITLE"],
chamber=com_chamber,
classification="committee",
)
com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
try:
detail_json = self.get(com_url).text
except scrapelib.HTTPError:
self.warning("error fetching committee %s" % com_url)
continue
details = json.loads(detail_json)["content"]
# Chairs sometimes lack FULLNAME; fall back to deriving a name from KPID.
for chair in details["CHAIR"]:
if chair.get("FULLNAME", None):
chair_name = chair["FULLNAME"]
else:
chair_name = self.parse_kpid(chair["KPID"])
self.warning("no FULLNAME for %s", chair["KPID"])
committee.add_member(chair_name, "chairman")
for vicechair in details["VICECHAIR"]:
committee.add_member(vicechair["FULLNAME"], "vice-chairman")
for rankedmember in details["RMMEM"]:
committee.add_member(rankedmember["FULLNAME"], "ranking member")
for member in details["MEMBERS"]:
committee.add_member(member["FULLNAME"])
# NOTE(review): orphaned fragment — the enclosing function header (and the
# definition of `url`) is not visible; the member-parsing loop is truncated.
# Scrapes lower-chamber committees and their member lists from HTML.
html = self.get(url).text
doc = lxml.html.fromstring(html)
for com in doc.xpath('//h2[@class="commhighlight"]'):
members_url = com.xpath(
'following-sibling::p[1]/a[text()="Members"]/@href'
)[0]
# Rebinds `com` from the lxml heading element to an Organization.
com = Organization(com.text, chamber="lower", classification="committee")
com.add_source(members_url)
try:
member_html = self.get(members_url).text
mdoc = lxml.html.fromstring(member_html)
except HTTPError:
self.warning(
"Member list for {} failed to respond; skipping".format(com.name)
)
continue
# each legislator in their own table
# first row, second column contains all the info
for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):
# name is tail string of last element
name = ltable.text_content()
text = ltable.text
# Strip the <b> tag's own leading text (presumably a role/title
# prefix — TODO confirm against a live page) to isolate the name.
if text and name != text:
name = name.replace(text, "")
# role is inside a nested b tag
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch the HTML vote page at *url* and dispatch committee votes.

    Returns early (yielding nothing) when the page is missing or the
    vote was a voice vote.
    """
    try:
        markup = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(bill.identifier))
        return
    doc = lxml.html.fromstring(markup)
    doc.make_links_absolute(url)
    description = doc.xpath("//b")[0].text_content()
    if description == "":
        # Newer page layout keeps the description in a <center> element.
        description = doc.xpath("//center")[0].text
    # Voice votes carry no roll call, so there is nothing to record.
    if "on voice vote" in description:
        return
    if "committee" in description.lower():
        yield from self.scrape_committee_vote(
            bill, actor, date, motion, doc, url, uniqid
        )
def scrape_vote(self, session, rollcall_number):
    """Scrape one House roll-call vote PDF from mass.gov.

    :param session: session string; its digits form the URL's session id.
    :param rollcall_number: sequential roll-call number, zero-padded to 5.
    :raises self.EndOfHouseVotes: on HTTP 404, signalling the caller that
        the sequential roll-call numbers are exhausted.
    """
    url = (
        'http://www.mass.gov/legis/journal/RollCallPdfs/'
        '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
    url_args = dict(
        session=re.findall(r'\d+', session).pop(),
        rollcall=str(rollcall_number).zfill(5))
    url = url.format(**url_args)
    try:
        vote_file, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        # We'll hit a 404 at the end of the votes.
        self.warning('Stopping; encountered a 404 at %s' % url)
        raise self.EndOfHouseVotes
    try:
        text = convert_pdf(vote_file, type='text')
        text = text.decode('utf8')
        # A hack to guess whether this PDF has embedded images or contains
        # machine readable text: text PDFs carry many Y/N/P/X vote markers.
        if len(re.findall(r'[YNPX]', text)) > 157:
            vote = self.house_get_vote(text, vote_file, session)
        else:
            vote = self.house_get_vote_with_images(text, vote_file, session)
            self.house_add_votes_from_image(vote_file, vote)
    finally:
        # Remove the downloaded temp PDF once parsing is done (or fails),
        # matching the cleanup the other PDF scrapers in this file perform.
        os.remove(vote_file)
    vote.add_source(url)
# NOTE(review): orphaned fragment — `response`, `expected_range`, and
# `payload` are defined outside this chunk.
page = lxml.html.fromstring(response.text)
# Unpack the single matching <input>; raises if not exactly one match.
returned_range, = page.xpath(
"//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']")
returned_range = returned_range.value
if returned_range != expected_range:
# Fake a 5xx status so the raised HTTPError is treated as retryable.
response.status_code = 520
# In the event of a retry, the new request does not
# contain the correct payload data. This comes as a
# result of not updating the payload via sessionSecrets:
# so, we do that here.
payload.update(self.sessionSecrets(page))
raise scrapelib.HTTPError(response)
# NOTE(review): orphaned fragment — the enclosing loop (defining `title`,
# `when`, `info`, `URL`) is not visible, and the chunk is truncated after
# the final `if`. Builds an Event, attaches sources, the hosting committee,
# and document links.
# if not when.endswith(session[ :len("20XX")]):
# continue
event = Event(
name=title,
start_date=self._tz.localize(
datetime.datetime.strptime(when, "%b %d, %Y")
),
location_name="State Capitol",
)
event.add_source(URL)
# The event page URL is embedded in the cell's text content.
url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
try:
doc = self.lxmlize(url)
except HTTPError:
self.logger.warning("Page missing, skipping")
continue
event.add_source(url)
# Attach the hosting committee, when one is linked from the event page.
committee = doc.xpath('//a[text()="View committee page"]/@href')
if committee:
committee_doc = self.lxmlize(committee[0])
committee_name = committee_doc.xpath(
'//h3[@class="heading committee"]/text()'
)[0].strip()
event.add_participant(committee_name, type="committee", note="host")
documents = doc.xpath(".//td")
for document in documents:
# Document PDF links live in the cell's onclick handler.
url = re.search(r"(http://.*?pdf)", document.xpath("@onclick")[0])
if url is None: