How to use the spatula.Spatula function in spatula

To help you get started, we’ve selected a few spatula examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openstates / openstates / scrapers / va / bills.py View on Github external
if block:
            block = block[0].text.replace('\r\n', ' ')

            pieces = block.split('--')
            # if there are only two pieces, there are no abstentions
            if len(pieces) <= 2:
                return []
            else:
                # lookahead and don't split if comma precedes initials
                # Also, Bell appears as Bell, Richard B. and Bell, Robert P.
                # and so needs the lookbehind assertion.
                return [x.strip() for x in re.split('(?
github openstates / openstates / scrapers / va / bills.py View on Github external
subjects = collections.defaultdict(list)
        for link in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
            for bill_id in self.scrape_page(SubjectBillListPage, url=link.get('href')):
                subjects[bill_id].append(link.text)
        return subjects

class SubjectBillListPage(Page, Spatula):
    def handle_page(self):
        for bill in self.doc.xpath('//ul[@class="linkSect"]/li'):
            link = bill.getchildren()[0]
            yield str(link.text_content())
        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(SubjectBillListPage, url=next_url[0])

class BillListPage(Page, Spatula):
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith('S') or bill_id.startswith('H'):
                continue

            # create a bill
            desc = bill.xpath('text()')[0].strip()
            chamber = {
                'H': 'lower',
                'S': 'upper',
            }[bill_id[0]]
            bill_type = {'B': 'bill',
github openstates / openstates / openstates / va / bills.py View on Github external
("Committee substitute printed", SKIP),
    ("Bill text as passed", SKIP),
    ("Acts of Assembly", SKIP),
)


class SubjectPage(Page, Spatula):
    def handle_page(self):
        subjects = collections.defaultdict(list)
        for link in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
            for bill_id in self.scrape_page(SubjectBillListPage, url=link.get("href")):
                subjects[bill_id].append(link.text)
        return subjects


class SubjectBillListPage(Page, Spatula):
    def handle_page(self):
        for bill in self.doc.xpath('//ul[@class="linkSect"]/li'):
            link = bill.getchildren()[0]
            yield str(link.text_content())
        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(SubjectBillListPage, url=next_url[0])


class BillListPage(Page, Spatula):
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())
github openstates / openstates / openstates / fl / committees.py View on Github external
for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
            role = dt.text.replace(": ", "").strip().lower()
            member = dt.xpath("./following-sibling::dd")[0].text_content()
            member = self.clean_name(member)
            comm.add_member(member, role=role)

        for ul in self.doc.xpath('//div[@id="members"]/ul/li'):
            member = self.clean_name(ul.text_content())
            comm.add_member(member)

        comm.add_source(self.url)

        yield comm


class FlCommitteeScraper(Scraper, Spatula):
    def scrape(self):
        yield from self.scrape_page_items(SenComList)
        yield from self.scrape_page_items(HouseComList)
github openstates / openstates / openstates / va / people.py View on Github external
return name, action, date


class SenateList(MemberList):
    chamber = "upper"
    detail_page = SenateDetail
    list_xpath = '//div[@class="lColRt"]/ul/li/a'


class DelegateList(MemberList):
    chamber = "lower"
    detail_page = DelegateDetail
    list_xpath = '//div[@class="lColLt"]/ul/li/a'


class VaPersonScraper(Scraper, Spatula):
    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]
            self.info("no session specified, using %s", session["identifier"])
        url = "http://lis.virginia.gov/{}/mbr/MBR.HTM".format(
            SESSION_SITE_IDS[session["identifier"]]
        )
        committees = {}
        yield from self.scrape_page_items(
            SenateList, session=session, url=url, committees=committees
        )
        yield from self.scrape_page_items(
            DelegateList, session=session, url=url, committees=committees
        )
        for committee in committees.values():
            yield committee
github openstates / openstates / scrapers / va / bills.py View on Github external
('Read third time and agreed', ['passage', 'reading-3']),
    ('Passed (Senate|House)', 'passage'),
    ('Read third time and defeated', 'failure'),
    ('Presented', 'introduction'),
    ('Prefiled and ordered printed', 'introduction'),
    ('Read first time', 'reading-1'),
    ('Read second time', 'reading-2'),
    ('Read third time', 'reading-3'),
    ('Senators: ', None),
    ('Delegates: ', None),
    ('Committee substitute printed', None),
    ('Bill text as passed', None),
    ('Acts of Assembly', None),
)

class SubjectPage(Page, Spatula):
    def handle_page(self):
        subjects = collections.defaultdict(list)
        for link in self.doc.xpath('//ul[@class="linkSect"]/li/a'):
            for bill_id in self.scrape_page(SubjectBillListPage, url=link.get('href')):
                subjects[bill_id].append(link.text)
        return subjects

class SubjectBillListPage(Page, Spatula):
    def handle_page(self):
        for bill in self.doc.xpath('//ul[@class="linkSect"]/li'):
            link = bill.getchildren()[0]
            yield str(link.text_content())
        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(SubjectBillListPage, url=next_url[0])
github openstates / openstates / openstates / va / bills.py View on Github external
for source in vote.sources:
        if "+vot+" in source["url"]:
            vote.pupa_id = source["url"]
            break
    else:
        vote.pupa_id = None

    if vote.pupa_id in _seen_pupa_ids:
        # skip over votes we've already seen
        return
    else:
        _seen_pupa_ids.add(vote.pupa_id)
        yield vote


class VaBillScraper(Scraper, Spatula):
    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)
        session_id = SESSION_SITE_IDS[session]
        url = BASE_URL + URL_PATTERNS["list"].format(session_id)
        subject_url = BASE_URL + URL_PATTERNS["subjects"].format(session_id)
        subjects = self.scrape_page(SubjectPage, url=subject_url)
        yield from self.scrape_page_items(
            BillListPage,
            url=url,
            session=session,
            session_id=session_id,
            subjects=subjects,
        )
github openstates / openstates / openstates / fl / bills.py View on Github external
subject = None

        for line in self.lines:
            if SUBJ_RE.match(line):
                subject = line.lower().strip()
            elif subject and BILL_RE.findall(line):
                for bill in BILL_RE.findall(line):
                    # normalize bill id to [SH]#
                    bill = bill.replace("-", "")
                    subjects[bill].add(subject)

        return subjects


class FlBillScraper(Scraper, Spatula):
    def scrape(self, session=None):
        # FL published a bad bill in 2019, #143
        self.raise_errors = False
        self.retry_attempts = 1
        self.retry_wait_seconds = 3

        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        subject_url = "http://www.leg.state.fl.us/data/session/{}/citator/Daily/subindex.pdf".format(
            session
        )
        subjects = self.scrape_page(SubjectPDF, subject_url)

        url = "http://flsenate.gov/Session/Bills/{}?chamber=both".format(session)
github openstates / openstates / scrapers / va / bills.py View on Github external
sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
                self.kwargs['session'],
                bill_id.replace(' ', ''),
            )

            list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
            list(self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill))
            bill.subject = self.kwargs['subjects'][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage, url=next_url[0], **self.kwargs)

class BillSponsorPage(Page, Spatula):
    def handle_page(self):
        for slist in self.doc.xpath('//ul[@class="linkSect"]'):
            # note that first ul is origin chamber
            for sponsor in slist.xpath('li'):
                name = sponsor.text_content().strip()
                if name.endswith(u' (chief\xa0patron)'):
                    name = name[:-15]
                    type = 'primary'
                elif name.endswith(u' (chief\xa0co-patron)'):
                    name = name[:-18]
                    type = 'cosponsor'
                else:
                    type = 'cosponsor'
                self.obj.add_sponsorship(name, type, 'person', type == 'primary')
                yield self.obj

spatula

A modern Python library for writing maintainable web scrapers.

MIT
Latest version published 5 months ago

Package Health Score

66 / 100
Full package analysis