How to use the pikepdf.parse_content_stream function in pikepdf

To help you get started, we’ve selected a few pikepdf examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pikepdf / pikepdf / tests / test_parsers.py View on Github external
def test_invalid_stream_object():
    """Parsing a plain Dictionary as a content stream must raise TypeError."""
    with pytest.raises(TypeError):
        # A dictionary object is handed over where a page/stream is expected.
        not_a_stream = Dictionary({"/Hi": 3})
        parse_content_stream(not_a_stream)
github pikepdf / pikepdf / tests / test_parsers.py View on Github external
def test_text_filter(resources, outdir):
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'

    # Ensure the test PDF has text we can find
    proc = run(
        ['pdftotext', str(input_pdf), '-'], check=True, stdout=PIPE, encoding='utf-8'
    )
    assert proc.stdout.strip() != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]

    keep = []
    for operands, command in parse_content_stream(
        page, """TJ Tj ' " BT ET Td TD Tm T* Tc Tw Tz TL Tf Tr Ts"""
    ):
        if command == Operator('Tj'):
            print("skipping Tj")
            continue
        keep.append((operands, command))

    new_stream = Stream(pdf, keep)
    print(new_stream.read_bytes())  # pylint: disable=no-member
    page['/Contents'] = new_stream
    page['/Rotate'] = 90

    pdf.save(outdir / 'notext.pdf', True)

    proc = run(
        ['pdftotext', str(outdir / 'notext.pdf'), '-'],
github pikepdf / pikepdf / tests / test_image_access.py View on Github external
def inline(resources):
    """Return the first inline image of the sample PDF together with the PDF.

    Opens ``image-mono-inline.pdf`` from ``resources`` and scans the
    instructions of its first page. The first instruction whose leading
    operand is a ``PdfInlineImage`` yields ``(image, pdf)``. If no such
    instruction exists, the function falls off the end and returns ``None``.
    """
    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
    first_page = pdf.pages[0]
    for operands, _command in parse_content_stream(first_page):
        # Instructions without operands cannot carry an inline image.
        if not operands:
            continue
        candidate = operands[0]
        if isinstance(candidate, PdfInlineImage):
            return candidate, pdf
github jbarlow83 / OCRmyPDF / src / ocrmypdf / pdfinfo / info.py View on Github external
undefined in the spec, but we just pretend nothing happened and leave the
    CTM unchanged.
    """

    stack = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_vector = False
    vector_ops = set('S s f F f* B B* b b*'.split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | image_ops)

    for n, graphobj in enumerate(
        _normalize_stack(
            pikepdf.parse_content_stream(contentstream, operator_whitelist)
        )
    ):
        operands, operator = graphobj
        if operator == 'q':
            stack.append(ctm)
            if len(stack) > 32:  # See docstring
                if len(stack) > 128:
                    raise RuntimeError(
                        "PDF graphics stack overflowed hard limit, operator %i" % n
                    )
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Keeping the ctm the same seems to be the only sensible thing
github jbarlow83 / OCRmyPDF / src / ocrmypdf / _graft.py View on Github external
def strip_invisible_text(pdf, page):
    """Collect the page's content-stream instructions, dropping mode-3 text.

    Walks the instructions of ``page`` and accumulates the ones to keep in
    ``stream``:

    * Instructions outside a ``BT`` ... ``ET`` text object are kept as-is.
    * Instructions inside a text object are buffered; when ``ET`` is reached
      the whole buffered object is kept only if the last ``Tr`` operand seen
      in that object (0 if there was none) is not 3.

    NOTE(review): render mode 3 is presumably the PDF "invisible" text
    rendering mode, matching the function name -- confirm against the spec.
    NOTE(review): in the lines visible here ``pdf`` is unused and ``stream``
    is only built, never written back to the page; the code that consumes
    it is not shown in this view.
    """
    stream = []  # instructions that survive the filter, in original order
    in_text_obj = False  # True between a BT and its matching ET
    render_mode = 0  # most recent Tr operand inside the current text object
    text_objects = []  # buffer for the instructions of the current text object

    # presumably merges the page's content streams into one -- TODO confirm
    page.page_contents_coalesce()
    # An empty operator string is passed as the second argument; presumably
    # this means "do not filter by operator" -- verify against pikepdf docs.
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == pikepdf.Operator('BT'):
                # Start buffering a new text object; its render mode is
                # tracked from 0 regardless of earlier text objects.
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                # Non-text instruction: always kept.
                stream.append((operands, operator))
        else:
            if operator == pikepdf.Operator('Tr'):
                render_mode = operands[0]
            # Every instruction inside the text object (including Tr and ET)
            # is buffered so the object can be kept or dropped as a unit.
            text_objects.append((operands, operator))
            if operator == pikepdf.Operator('ET'):
                in_text_obj = False
                # Decision uses the render mode in effect at ET time.
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()