        # Tail of the subject-index handler: `link` and the `subjects`
        # defaultdict come from the elided surrounding code; each subject
        # link's bill list is scraped to build a bill_id -> [subject, ...] map.
        for bill_id in self.scrape_page(SubjectBillListPage, url=link.get("href")):
            subjects[bill_id].append(link.text)
        return subjects
class SubjectBillListPage(Page, Spatula):
def handle_page(self):
for bill in self.doc.xpath('//ul[@class="linkSect"]/li'):
link = bill.getchildren()[0]
yield str(link.text_content())
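        # If a "More..." link is present, recurse into the next page of results.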
next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
if next_url:
yield from self.scrape_page_items(SubjectBillListPage, url=next_url[0])
class BillListPage(Page, Spatula):
def handle_page(self):
bills = self.doc.xpath('//ul[@class="linkSect"]/li')
for bill in bills:
link = bill.getchildren()[0]
bill_id = str(link.text_content())
if not bill_id.startswith(("S", "H")):
continue
# create a bill
desc = bill.xpath("text()")[0].strip()
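            # Virginia bill IDs encode chamber and type in their first two
            # characters, e.g. "HB" = House Bill, "SJ" = Senate Joint Resolution.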
chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
bill_type = {"B": "bill", "J": "joint resolution", "R": "resolution"}[
bill_id[1]
]
            # The constructor call is truncated in the snippet; the argument
            # list below is a reconstruction, and the session is assumed to be
            # passed in via the scrape kwargs.
            bill = Bill(
                bill_id,
                self.kwargs["session"],
                desc,
                chamber=chamber,
                classification=bill_type,
            )
import re  # used by the regex-based list handlers below

from pupa.scrape import Scraper, Organization
from spatula import Spatula, Page

from .utils import fix_name
class HouseComList(Page):
url = "http://www.myfloridahouse.gov/Sections/Committees/committees.aspx"
list_xpath = "//a[contains(@href, 'committeesdetail.aspx')]"
def handle_page(self):
# don't use handle_page_item because we need to look back at prior element
parent = None
for item in self.doc.xpath(self.list_xpath):
cssclass = item.attrib.get("class", "")
name = item.text_content().strip()
if "parentcommittee" in cssclass:
parent = None
chamber = "lower"
            # NOTE: assumes the listing opens with a parent committee, so that
            # `chamber` is bound before first use.
            comm = Organization(
                name=name, classification="committee", chamber=chamber, parent_id=parent
            )
            yield self.scrape_page(HouseComDetail, item.attrib["href"], obj=comm)

            # remember a parent committee so its subcommittees can link to it
            if "parentcommittee" in cssclass:
                parent = comm._id
                chamber = None
address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
leg.add_contact_detail(type="address", value=address, note="district")
else:
address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
leg.add_contact_detail(type="address", value=address, note="district")
leg.add_source(self.url)
leg.add_source(self._html_url)
return leg
    def handle_page(self):
        # Python 3 super() form; behavior is unchanged.
        yield super().handle_page()
class RepList(Page):
url = "http://www.house.leg.state.mn.us/members/hmem.asp"
list_xpath = '//div[@id="Alpha"]//div[@class="media my-3"]'
def handle_list_item(self, item):
photo_url = item.xpath("./img/@src")[0]
url = item.xpath(".//h5/a/@href")[0]
name_text = item.xpath(".//h5/a/b/text()")[0]
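        # name_text is of the form "Jane Doe (43B, DFL)": member name, then
        # district and party code in parentheses (the name here is illustrative).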
name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
name = name_match.group(1).strip()
district = name_match.group(2).lstrip("0").upper()
party_text = name_match.group(3)
party = PARTIES[party_text]
        # The comprehension is truncated in the snippet; the XPath below is a
        # placeholder for the original expression selecting the member's
        # contact-info text lines.
        info_texts = [
            x.strip()
            for x in item.xpath(".//text()")  # placeholder XPath
            if x.strip()
        ]
class SenComList(Page):
url = "http://www.flsenate.gov/Committees/"
list_xpath = "//a[contains(@href, 'Committees/Show')]/@href"
def handle_list_item(self, item):
return self.scrape_page(SenComDetail, item)
class SenComDetail(Page):
def clean_name(self, member):
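        # Strip the "Senator " prefix and any party suffix before applying the
        # shared fix_name() normalizer, e.g. "Senator Jane Doe (D)" -> "Jane Doe"
        # (example input is illustrative).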
member = member.replace("Senator ", "").strip()
member = member.replace(" (D)", "")
member = member.replace(" (R)", "")
member = member.replace(" (I)", "")
member = fix_name(member)
return member
def handle_page(self):
name = self.doc.xpath('//h2[@class="committeeName"]')[0].text
if name.startswith("Appropriations Subcommittee"):
return
# TODO: restore scraping of Appropriations Subcommittees
# name = name.replace('Appropriations ', '')
# parent = {'name': 'Appropriations', 'classification': 'upper'}
# chamber = None
if "parentcommittee" in cssclass:
parent = None
chamber = "lower"
comm = Organization(
name=name, classification="committee", chamber=chamber, parent_id=parent
)
yield self.scrape_page(HouseComDetail, item.attrib["href"], obj=comm)
# parent for next time
if "parentcommittee" in cssclass:
parent = comm._id
chamber = None
class HouseComDetail(Page):
def clean_name(self, name):
name = name.replace(" [D]", "")
name = name.replace(" [R]", "")
name = name.replace(" [I]", "")
name = name.strip()
name = fix_name(name)
return name
def handle_page(self):
name = self.doc.xpath('//h1[@class="cd_ribbon"]')[0].text
for lm in self.doc.xpath('//div[@class="cd_LeaderMember"]'):
role = lm.xpath('.//div[@class="cd_LeaderTitle"]')[0].text_content().strip()
            name = (
                lm.xpath('.//span[@class="cd_LeaderTitle"]/a')[0].text_content().strip()
            )
            name = self.clean_name(name)
            self.obj.add_member(name, role=role)

        for cm in self.doc.xpath('//p[@class="cd_committeemembers"]//a'):
            name = self.clean_name(cm.text_content())
            self.obj.add_member(name)

        self.obj.add_source(self.url)
        yield self.obj
class DelegateDetail(MemberDetail):
role = "Delegate"
chamber = "lower"
def get_photo_url(self):
lis_id = get_lis_id(self.chamber, self.url)
if lis_id:
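            # Zero-pad the numeric part of the LIS id to four digits,
            # e.g. "H123" -> "H0123", to match the image URL scheme.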
lis_id = "{}{:04d}".format(lis_id[0], int(lis_id[1:]))
            return "http://memdata.virginiageneralassembly.gov/images/display_image/{}".format(
                lis_id
            )
class MemberList(Page):
def handle_list_item(self, item):
name = item.text
lname = name.lower()
if "resigned" in lname or "vacated" in lname or "retired" in lname:
return
if name in CHAMBER_MOVES and (self.chamber != CHAMBER_MOVES[name]):
return
name, action, date = clean_name(name)
leg = Person(name=name)
leg.add_source(self.url)
leg.add_source(item.get("href"))
leg.add_link(item.get("href"))
        # The call is truncated in the snippet; the completion below is an
        # assumption, handing the Person to a chamber-specific detail page via
        # a hypothetical `detail_page` attribute set on subclasses.
        yield from self.scrape_page(self.detail_page, item.get("href"), obj=leg)