How to use the pdfplumber.load function in pdfplumber

To help you get started, we’ve selected a few pdfplumber examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: BuzzFeedNews/nics-firearm-background-checks, scripts/parse-pdf.py (view on GitHub)
def parse_pdf(file_obj):
    """Parse every data page of a PDF into one combined table.

    The PDF is opened with ``pdfplumber.load``, each page after the first is
    passed to ``parse_page``, and the per-page results are concatenated with
    ``pd.concat`` and re-indexed from zero.

    Args:
        file_obj: Whatever ``pdfplumber.load`` accepts as its input.

    Returns:
        The concatenated result with every row whose ``"state"`` value is
        ``"Totals"`` removed.
    """
    document = pdfplumber.load(file_obj)

    # Note: As of Nov. 2019 file, first page is documentation, so skip it.
    data_pages = document.pages[1:]
    per_page = (parse_page(page) for page in data_pages)

    combined = pd.concat(per_page).reset_index(drop=True)
    keep_row = combined["state"] != "Totals"
    return combined[keep_row]
Source: jsvine/pdfplumber, pdfplumber/cli.py (view on GitHub)
def main():
    """Command-line entry point: load a PDF and dump it as CSV or JSON.

    Options come from ``parse_args()``. The input is opened with
    ``pdfplumber.load`` restricted to the requested pages, then handed to
    ``to_csv`` when the chosen format is ``"csv"`` and to ``to_json`` for any
    other format value.
    """
    options = parse_args()
    document = pdfplumber.load(options.infile, pages=options.pages)

    # Both writers share the same call signature, so pick one and call once.
    write = to_csv if options.format == "csv" else to_json
    write(document, options.types, options.encoding)
Source: BuzzFeedNews/nics-firearm-background-checks, scripts/get-month.py (view on GitHub)
import requests
import datetime
import re
from io import BytesIO

def parse_date(pdf):
    """Return the "UPDATED: As of ..." date printed on the first page of *pdf*.

    The first page's text is extracted with ``x_tolerance=5`` and searched for
    a line of the form ``UPDATED: As of <Month> <day>, <year>``.

    Args:
        pdf: Object exposing ``pages``; the first page must provide
            ``extract_text(x_tolerance=...)``.

    Returns:
        A ``datetime.datetime`` parsed from the matched text with the
        ``"%B %d, %Y"`` format.

    Raises:
        AttributeError: If the "UPDATED: As of" line is not found.
        ValueError: If the matched text is not in ``"%B %d, %Y"`` form.
    """
    first_page_text = pdf.pages[0].extract_text(x_tolerance=5)
    match = re.search(r"UPDATED:\s+As of (.+)\n", first_page_text)
    # No guard on purpose: a missing line surfaces as AttributeError here.
    date_string = match.group(1)
    return datetime.datetime.strptime(date_string, "%B %d, %Y")

if __name__ == "__main__":
    # Download the PDF, read the date from its first page, and print it as
    # YYYY-MM.
    URL = "https://www.fbi.gov/file-repository/active_records_in_the_nics-index.pdf"

    # Bound the wait: without a timeout, requests can block indefinitely on
    # an unresponsive server.
    response = requests.get(URL, timeout=60)
    # Fail on HTTP errors here rather than passing an error page to the PDF
    # parser, where the failure would be much harder to diagnose.
    response.raise_for_status()
    raw = response.content

    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))