How to use the ocrmypdf.pdfinfo function in ocrmypdf

To help you get started, we’ve selected a few ocrmypdf examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jbarlow83 / OCRmyPDF / tests / test_tess4.py View on Github external
def test_skip_pages_does_not_replicate(resources, basename, outdir):
    infile = resources / basename
    outpdf = outdir / basename

    check_ocrmypdf(
        infile,
        outpdf,
        '--pdf-renderer',
        'sandwich',
        '--force-ocr',
        '--tesseract-timeout',
        '0',
    )

    info_in = pdfinfo.PdfInfo(infile)

    info = pdfinfo.PdfInfo(outpdf)
    for page in info:
        assert len(page.images) == 1, "skipped page was replicated"

    for n in range(len(info_in)):
        assert info[n].width_inches == info_in[n].width_inches
github jbarlow83 / OCRmyPDF / tests / test_pdfinfo.py View on Github external
def test_pickle(resources):
    # For multiprocessing we must be able to pickle our information - if
    # this fails then we are probably storing some unpickleabe pikepdf or
    # other external data around
    filename = resources / 'graph_ocred.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    pickle.dumps(pdf)
github jbarlow83 / OCRmyPDF / tests / test_tess4.py View on Github external
def test_content_preservation(resources, outpdf):
    infile = resources / 'masks.pdf'

    check_ocrmypdf(
        infile, outpdf, '--pdf-renderer', 'sandwich', '--tesseract-timeout', '0'
    )

    info = pdfinfo.PdfInfo(outpdf)
    page = info[0]
    assert len(page.images) > 1, "masks were rasterized"
github jbarlow83 / OCRmyPDF / tests / test_pdfinfo.py View on Github external
def test_single_page_text(outdir):
    filename = outdir / 'text.pdf'
    pdf = Canvas(str(filename), pagesize=(8 * 72, 6 * 72))
    text = pdf.beginText()
    text.setFont('Helvetica', 12)
    text.setTextOrigin(1 * 72, 3 * 72)
    text.textLine(
        "Methink'st thou art a general offence and every" " man should beat thee."
    )
    pdf.drawText(text)
    pdf.showPage()
    pdf.save()

    info = pdfinfo.PdfInfo(filename)

    assert len(info) == 1
    page = info[0]

    assert page.has_text
    assert len(page.images) == 0