How to use the ocrmypdf.ocr function in ocrmypdf

To help you get started, we’ve selected a few ocrmypdf examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jbarlow83 / OCRmyPDF / tests / test_page_numbers.py View on Github external
def test_limited_pages(resources, outpdf, spoof_tesseract_cache):
    """Run OCR restricted to ``pages='5-6'`` and check which pages gained text.

    The ``multipage.pdf`` resource is processed with optimization disabled
    and plain ``pdf`` output, using ``spoof_tesseract_cache`` as the
    Tesseract environment. The result is then inspected through ``PdfInfo``.
    """
    ocr_options = dict(
        pages='5-6',
        optimize=0,
        output_type='pdf',
        tesseract_env=spoof_tesseract_cache,
    )
    ocrmypdf.ocr(resources / 'multipage.pdf', outpdf, **ocr_options)

    info = PdfInfo(outpdf)
    # The first page lies outside the requested range and must stay text-free.
    assert not info.pages[0].has_text
    # Indices 4 and 5 are the two pages selected by '5-6'.
    assert info.pages[4].has_text
    assert info.pages[5].has_text
github jbarlow83 / OCRmyPDF / tests / test_main.py View on Github external
def test_masks(spoof_tesseract_noop, resources, outpdf):
    """Run OCR on ``masks.pdf`` and expect the call to return ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf',
        outpdf,
        tesseract_env=spoof_tesseract_noop,
    )
    assert exit_code == ExitCode.ok
github jbarlow83 / OCRmyPDF / tests / test_graft.py View on Github external
def test_no_glyphless_graft(resources, outdir):
    """Run OCR on a merged PDF with ``MAX_REPLACE_PAGES`` patched down to 2.

    Pages from ``aspect.pdf`` and ``cmyk.pdf`` are appended to
    ``francais.pdf`` and the result is saved as ``outdir / 'test.pdf'``.
    OCR is then run with ``deskew=True`` and ``tesseract_timeout=0``.

    There are no explicit assertions: the test passes when ``ocrmypdf.ocr``
    completes without raising.
    """
    merged_path = outdir / 'test.pdf'

    # Open all three inputs as context managers so their file handles are
    # released deterministically. The save must happen while the source
    # documents are still open, because the appended pages come from them.
    with pikepdf.open(resources / 'francais.pdf') as pdf, \
            pikepdf.open(resources / 'aspect.pdf') as pdf_aspect, \
            pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(merged_path)

    # NOTE(review): the merged file has pages from three documents, so it
    # presumably exceeds the patched limit of 2 -- confirm against _graft.
    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(
            merged_path, outdir / 'out.pdf', deskew=True, tesseract_timeout=0
        )
github jbarlow83 / OCRmyPDF / tests / test_graft.py View on Github external
def test_links(resources, outpdf):
    """Check that cross-page link annotations still point at the right pages.

    ``link.pdf`` is processed with ``redo_ocr=True``, ``oversample=200`` and
    ``output_type='pdf'``. In the output, the first annotation on page 1 must
    target page 2 and the first annotation on page 2 must target page 1,
    compared by object/generation number.
    """
    ocrmypdf.ocr(
        resources / 'link.pdf', outpdf, redo_ocr=True, oversample=200, output_type='pdf'
    )
    # Use a context manager so the output file is closed even when an
    # assertion fails; the page objects are only valid while it is open.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen
github jbarlow83 / OCRmyPDF / tests / test_filters.py View on Github external
def test_filter_from_api(resources, outdir):
    """Pass ``whiteout`` as ``filter_ocr_image`` and expect an empty sidecar.

    ``crom.png`` is processed at ``image_dpi=100`` with a sidecar text file
    requested. After OCR the sidecar must contain nothing but whitespace.
    """
    sidecar_path = outdir / 'sidecar.txt'
    ocrmypdf.ocr(
        resources / 'crom.png',
        outdir / 'out.pdf',
        image_dpi=100,
        sidecar=sidecar_path,
        filter_ocr_image=whiteout,
    )
    sidecar_text = sidecar_path.read_text().strip()
    assert sidecar_text == ''