How to use spatula - 10 common examples

To help you get started, we’ve selected a few spatula examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openstates / openstates / scrapers / va / bills.py View on Github external
if block:
            block = block[0].text.replace('\r\n', ' ')

            pieces = block.split('--')
            # if there are only two pieces, there are no abstentions
            if len(pieces) <= 2:
                return []
            else:
                # lookahead and don't split if comma precedes initials
                # Also, Bell appears as Bell, Richard B. and Bell, Robert P.
                # and so needs the lookbehind assertion.
                return [x.strip() for x in re.split('(?
github openstates / openstates / openstates / fl / bills.py View on Github external
raise ValueError("vote count incorrect: " + self.url)

        if nv_count != 0:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.scraper.info("Votes don't add up; looking for additional ones")
            for line in self.lines[VOTE_START_INDEX:]:
                if not line.strip():
                    break
                for member in re.findall(r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}", line):
                    vote.vote("not voting", member)
        yield vote


class UpperComVote(PDF):
    def handle_page(self):
        """Parse a Senate committee vote PDF page.

        NOTE(review): this method continues beyond the visible snippet; only
        the motion extraction and Yea/Nay column detection are shown here.
        """
        # The sixth text line of the PDF holds "FINAL ACTION: <motion>".
        (_, motion) = self.lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.scraper.warning("Vote appears to be empty")
            return

        # Find the header row containing one or more repeated "Yea  Nay"
        # column-label pairs; take the first matching line's index.
        vote_top_row = [
            self.lines.index(x)
            for x in self.lines
            if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", x)
        ][0]
        # Column boundaries derived from the header row: positions before
        # yea_columns_end are "yes" marks, at/after nay_columns_begin are "no".
        yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
        nay_columns_begin = self.lines[vote_top_row].index("Nay")

        votes = {"yes": [], "no": [], "other": []}
github openstates / openstates / openstates / va / bills.py View on Github external
for bill_id in self.scrape_page(SubjectBillListPage, url=link.get("href")):
                subjects[bill_id].append(link.text)
        return subjects


class SubjectBillListPage(Page, Spatula):
    def handle_page(self):
        """Yield the bill-id text of each listed bill, then follow pagination."""
        for entry in self.doc.xpath('//ul[@class="linkSect"]/li'):
            anchor = entry.getchildren()[0]
            yield str(anchor.text_content())
        # Follow the "More..." link to the next page of results, if any.
        more = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if more:
            yield from self.scrape_page_items(SubjectBillListPage, url=more[0])


class BillListPage(Page, Spatula):
    def handle_page(self):
        """Iterate the bill listing and build a Bill for each entry.

        NOTE(review): method is truncated in this view — the Bill(...) call
        continues past the snippet.
        """
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            # Skip rows whose id doesn't start with S or H (not real bills).
            if not bill_id.startswith(("S", "H")):
                continue

            # create a bill
            desc = bill.xpath("text()")[0].strip()
            # First id letter encodes chamber; second encodes the bill type.
            chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
            bill_type = {"B": "bill", "J": "joint resolution", "R": "resolution"}[
                bill_id[1]
            ]
            bill = Bill(
github openstates / openstates / openstates / fl / committees.py View on Github external
from pupa.scrape import Scraper, Organization
from spatula import Spatula, Page
from .utils import fix_name


class HouseComList(Page):
    # Florida House committee index page.
    url = "http://www.myfloridahouse.gov/Sections/Committees/committees.aspx"
    list_xpath = "//a[contains(@href, 'committeesdetail.aspx')]"

    def handle_page(self):
        # don't use handle_page_item because we need to look back at prior element
        parent = None

        for item in self.doc.xpath(self.list_xpath):
            cssclass = item.attrib.get("class", "")
            name = item.text_content().strip()

            # A "parentcommittee" entry resets state; entries following it
            # attach to it as subcommittees (continuation not visible here).
            if "parentcommittee" in cssclass:
                parent = None
                chamber = "lower"

            comm = Organization(
github openstates / openstates / openstates / mn / people.py View on Github external
address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address", value=address, note="district")
        else:
            address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address", value=address, note="district")

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg

    def handle_page(self):
        # Explicit generator override: yields whatever the base class's
        # handle_page produces (presumably a scraped legislator — confirm
        # against the SenList base implementation).
        yield super(SenList, self).handle_page()


class RepList(Page):
    # Minnesota House member roster.
    url = "http://www.house.leg.state.mn.us/members/hmem.asp"
    list_xpath = '//div[@id="Alpha"]//div[@class="media my-3"]'

    def handle_list_item(self, item):
        # NOTE(review): method is truncated in this view — the info_texts
        # list comprehension continues past the snippet.
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        # Header text looks like "Jane Doe(01A, DFL)"; capture name,
        # district, and party abbreviation.
        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        # Drop the leading zero from the district number (e.g. "01A" -> "1A").
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip()
github openstates / openstates / openstates / fl / committees.py View on Github external
self.obj.add_member(name)

        self.obj.add_source(self.url)

        yield self.obj


class SenComList(Page):
    """Florida Senate committee index; each matched href is a detail page."""

    url = "http://www.flsenate.gov/Committees/"
    list_xpath = "//a[contains(@href, 'Committees/Show')]/@href"

    def handle_list_item(self, item):
        # The href itself is the detail-page URL; delegate to the detail scraper.
        return self.scrape_page(SenComDetail, item)


class SenComDetail(Page):
    def clean_name(self, member):
        """Strip the 'Senator ' title and any party marker, then normalize."""
        member = member.replace("Senator ", "").strip()
        member = member.replace(" (D)", "").replace(" (R)", "").replace(" (I)", "")
        return fix_name(member)

    def handle_page(self):
        """Scrape one Senate committee detail page.

        NOTE(review): method may continue beyond the visible snippet.
        """
        name = self.doc.xpath('//h2[@class="committeeName"]')[0].text
        # Appropriations Subcommittees are intentionally skipped for now.
        if name.startswith("Appropriations Subcommittee"):
            return
            # TODO: restore scraping of Appropriations Subcommittees
            # name = name.replace('Appropriations ', '')
            # parent = {'name': 'Appropriations', 'classification': 'upper'}
            # chamber = None
github openstates / openstates / openstates / fl / committees.py View on Github external
if "parentcommittee" in cssclass:
                parent = None
                chamber = "lower"

            comm = Organization(
                name=name, classification="committee", chamber=chamber, parent_id=parent
            )
            yield self.scrape_page(HouseComDetail, item.attrib["href"], obj=comm)

            # parent for next time
            if "parentcommittee" in cssclass:
                parent = comm._id
                chamber = None


class HouseComDetail(Page):
    def clean_name(self, name):
        """Drop the bracketed party tag ([D]/[R]/[I]) and normalize the name."""
        for tag in (" [D]", " [R]", " [I]"):
            name = name.replace(tag, "")
        return fix_name(name.strip())

    def handle_page(self):
        # NOTE(review): truncated here — member scraping continues past snippet.
        name = self.doc.xpath('//h1[@class="cd_ribbon"]')[0].text

        # Leadership entries: role title plus linked member name.
        for lm in self.doc.xpath('//div[@class="cd_LeaderMember"]'):
            role = lm.xpath('.//div[@class="cd_LeaderTitle"]')[0].text_content().strip()
            # NOTE(review): rebinding `name` clobbers the committee name read
            # above — confirm against the full source that this is intended.
            name = (
                lm.xpath('.//span[@class="cd_LeaderTitle"]/a')[0].text_content().strip()
            )
github openstates / openstates / openstates / va / people.py View on Github external
class DelegateDetail(MemberDetail):
    role = "Delegate"
    chamber = "lower"

    def get_photo_url(self):
        """Build the member-photo URL from the LIS id.

        The numeric portion of the id is zero-padded to four digits; returns
        None implicitly when no LIS id can be derived.
        """
        lis_id = get_lis_id(self.chamber, self.url)
        if not lis_id:
            return None
        padded = "{}{:04d}".format(lis_id[0], int(lis_id[1:]))
        return (
            "http://memdata.virginiageneralassembly.gov" "/images/display_image/{}"
        ).format(padded)


class MemberList(Page):
    def handle_list_item(self, item):
        # NOTE(review): truncated — the scrape_page(...) call continues past
        # the visible snippet.
        name = item.text

        # Skip members flagged as no longer serving.
        lname = name.lower()
        if "resigned" in lname or "vacated" in lname or "retired" in lname:
            return
        # Skip members listed under a chamber they moved away from.
        if name in CHAMBER_MOVES and (self.chamber != CHAMBER_MOVES[name]):
            return

        name, action, date = clean_name(name)

        leg = Person(name=name)
        leg.add_source(self.url)
        leg.add_source(item.get("href"))
        leg.add_link(item.get("href"))
        yield from self.scrape_page(
github openstates / openstates / scrapers / va / bills.py View on Github external
subjects = collections.defaultdict(list)
        for link in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
            for bill_id in self.scrape_page(SubjectBillListPage, url=link.get('href')):
                subjects[bill_id].append(link.text)
        return subjects

class SubjectBillListPage(Page, Spatula):
    def handle_page(self):
        """Emit each bill id on this subject page, then paginate via "More..."."""
        items = self.doc.xpath('//ul[@class="linkSect"]/li')
        for item in items:
            first_child = item.getchildren()[0]
            yield str(first_child.text_content())
        pagination = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if pagination:
            yield from self.scrape_page_items(SubjectBillListPage, url=pagination[0])

class BillListPage(Page, Spatula):
    def handle_page(self):
        # NOTE(review): truncated — the bill_type dict literal continues past
        # the visible snippet.
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            # NOTE(review): likely precedence bug — this parses as
            # (not bill_id.startswith('S')) or bill_id.startswith('H'),
            # which skips every "H..." bill. Another copy of this class uses
            # startswith(("S", "H")); confirm the intended behavior.
            if not bill_id.startswith('S') or bill_id.startswith('H'):
                continue

            # create a bill
            desc = bill.xpath('text()')[0].strip()
            # First id letter encodes chamber; second encodes the bill type.
            chamber = {
                'H': 'lower',
                'S': 'upper',
            }[bill_id[0]]
            bill_type = {'B': 'bill',
github openstates / openstates / openstates / fl / committees.py View on Github external
name = (
                lm.xpath('.//span[@class="cd_LeaderTitle"]/a')[0].text_content().strip()
            )
            name = self.clean_name(name)
            self.obj.add_member(name, role=role)

        for cm in self.doc.xpath('//p[@class="cd_committeemembers"]//a'):
            name = self.clean_name(cm.text_content())
            self.obj.add_member(name)

        self.obj.add_source(self.url)

        yield self.obj


class SenComList(Page):
    """Index of Florida Senate committees (hrefs lead to detail pages)."""

    url = "http://www.flsenate.gov/Committees/"
    list_xpath = "//a[contains(@href, 'Committees/Show')]/@href"

    def handle_list_item(self, item):
        # Pass the matched href straight through as the detail-page URL.
        return self.scrape_page(SenComDetail, item)


class SenComDetail(Page):
    def clean_name(self, member):
        """Remove the 'Senator ' prefix and party marker, then normalize."""
        cleaned = member.replace("Senator ", "").strip()
        for marker in (" (D)", " (R)", " (I)"):
            cleaned = cleaned.replace(marker, "")
        return fix_name(cleaned)

spatula

A modern Python library for writing maintainable web scrapers.

MIT
Latest version published 2 years ago

Package Health Score

48 / 100
Full package analysis