How to use the pikepdf.PdfMatrix class in pikepdf

To help you get started, we’ve selected a few pikepdf examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pikepdf / pikepdf / tests / test_pages.py View on Github external
def test_page_contents_add(graph, outdir):
    """Bracket the first page's content with a rotated graphics state and save.

    A ``q <matrix> cm`` stream is prepended and a ``Q`` stream is appended to
    page 0, then the document is written to ``outdir / 'out.pdf'``.

    Args:
        graph: The PDF under test (presumably a pytest fixture yielding an
            open pikepdf document -- confirm against the test suite).
        outdir: Directory-like object supporting ``/`` for building the
            output path.
    """
    pdf = graph

    # 45-degree rotation matrix, serialized as the operands of "cm".
    rotation = PdfMatrix().rotated(45)

    save_and_rotate = Stream(pdf, b'q %s cm' % rotation.encode())
    restore = Stream(pdf, b'Q')

    # (stream, prepend): the opener goes before the existing content, the
    # closer goes after it.
    for content, prepend in ((save_and_rotate, True), (restore, False)):
        pdf.pages[0].page_contents_add(content, prepend)

    pdf.save(outdir / 'out.pdf')
github jbarlow83 / OCRmyPDF / src / ocrmypdf / pdfinfo / info.py View on Github external
image there should be a 'cm' that sets up an image coordinate system
    where drawing from (0, 0) -> (1, 1) will draw on the desired area of the
    page.

    PDF units suit our needs so we initialize ctm to the identity matrix.

    According to the PDF specification, the maximum stack depth is 32. Other
    viewers tolerate some amount beyond this.  We issue a warning if the
    stack depth exceeds the spec limit and set a hard limit beyond this to
    bound our memory requirements.  If the stack underflows, the behavior is
    undefined in the spec; we just pretend nothing happened and leave the
    CTM unchanged.
    """

    stack = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_vector = False
    vector_ops = set('S s f F f* B B* b b*'.split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | image_ops)

    for n, graphobj in enumerate(
        _normalize_stack(
            pikepdf.parse_content_stream(contentstream, operator_whitelist)
        )
    ):
        operands, operator = graphobj
        if operator == 'q':
            stack.append(ctm)
            if len(stack) > 32:  # See docstring
github jbarlow83 / OCRmyPDF / src / ocrmypdf / pdfinfo / info.py View on Github external
stack.append(ctm)
            if len(stack) > 32:  # See docstring
                if len(stack) > 128:
                    raise RuntimeError(
                        "PDF graphics stack overflowed hard limit, operator %i" % n
                    )
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Keeping the ctm the same seems to be the only sensible thing
                # to do. Just pretend nothing happened, keep calm and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            image_name = operands[0]
            settings = XobjectSettings(
                name=image_name, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            xobject_settings.append(settings)
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            iimage = operands[0]
            inline = InlineSettings(
                iimage=iimage, shorthand=ctm.shorthand, stack_depth=len(stack)
            )
            inline_images.append(inline)
        elif operator in vector_ops:
            found_vector = True

    return ContentsInfo(
github jbarlow83 / OCRmyPDF / src / ocrmypdf / pdfinfo / info.py View on Github external
on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    """

    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorInfo()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
github jbarlow83 / OCRmyPDF / src / ocrmypdf / pdfinfo / info.py View on Github external
is to find the resolution at which the page can be rastered without
    downsampling.

    """

    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form':
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorInfo()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
github jbarlow83 / OCRmyPDF / src / ocrmypdf / _graft.py View on Github external
# content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
    mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
    wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    # log.debug('%r', scale_x, scale_y)
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size difference between initial and text PDF, then untranslate, and
    # finally move the lower left corner to match the mediabox
    ctm = translate @ rotate @ scale @ untranslate @ corner
github jbarlow83 / OCRmyPDF / src / ocrmypdf / _graft.py View on Github external
pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
    mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
    wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    # log.debug('%r', scale_x, scale_y)
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)
github jbarlow83 / OCRmyPDF / src / ocrmypdf / _graft.py View on Github external
corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    # log.debug('%r', scale_x, scale_y)
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size difference between initial and text PDF, then untranslate, and
    # finally move the lower left corner to match the mediabox
    ctm = translate @ rotate @ scale @ untranslate @ corner

    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(