How to use the paperwork-backend.paperwork_backend.pdf.doc.PdfDoc class in paperwork-backend

To help you get started, we’ve selected a few paperwork-backend examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openpaperwork / paperwork / paperwork-backend / paperwork_backend / docimport.py View on Github external
logger.info("Importing PDF from '%s'" % (file_uri))
            idx = 0

            for child in self.fs.recurse(file_uri):
                gc.collect()
                if not self.check_file_type(child):
                    continue
                h = PdfDoc.hash_file(self.fs, child)
                if docsearch.is_hash_in_index(h):
                    logger.info(
                        "Document %s already found in the index. Skipped",
                        child
                    )
                    continue
                imported.append(child)
                doc = PdfDoc(self.fs, docsearch.rootdir)
                error = doc.import_pdf(child)
                if error:
                    continue
                docs.append(doc)
                pages += [p for p in doc.pages]
                idx += 1
        return ImportResult(
            imported_file_uris=imported,
            select_doc=doc, new_docs=docs,
            new_docs_pages=pages,
            stats={
                _("PDF"): len(docs),
                _("Document(s)"): len(docs),
                _("Page(s)"): sum([d.nb_pages for d in docs]),
            },
github openpaperwork / paperwork / paperwork-backend / paperwork_backend / index.py View on Github external
from .util import strip_accents


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Lightweight record types, both plain named tuples.
# COMMAND carries the fields (func, args, kwargs).
COMMAND = collections.namedtuple("COMMAND", ("func", "args", "kwargs"))
# RESULT carries the fields (exc, ret).
RESULT = collections.namedtuple("RESULT", ("exc", "ret"))


# Known document types, one 3-tuple per type:
#   (is_*_doc callable, the class's `doctype` attribute, document class)
# NOTE(review): presumably the is_*_doc callables are tried in list order to
# pick the class for a document -- confirm against the code reading this list.
DOC_TYPE_LIST = [
    (is_pdf_doc, PdfDoc.doctype, PdfDoc),
    (is_img_doc, ImgDoc.doctype, ImgDoc)
]


class PaperworkIndex(object):
    # Whoosh schema declaring the fields kept for each indexed document.
    # Every field is declared with stored=True, so its value is saved in the
    # index with the document rather than only being made searchable.
    WHOOSH_SCHEMA = whoosh.fields.Schema(
        # static up to date schema
        # docid: ID field (value indexed as a single term); unique=True marks
        # it as the field identifying a document.
        docid=whoosh.fields.ID(stored=True, unique=True),
        # doctype: ID field, explicitly not unique (many documents share one).
        doctype=whoosh.fields.ID(stored=True, unique=False),
        # docfilehash: ID field.
        # NOTE(review): presumably the hash of the document's file, as
        # produced by something like PdfDoc.hash_file -- confirm.
        docfilehash=whoosh.fields.ID(stored=True),
        # content: TEXT field; spelling=True enables Whoosh's spelling
        # support for this field.
        content=whoosh.fields.TEXT(spelling=True, stored=True),
        # label: KEYWORD field; commas=True makes the items comma-separated
        # instead of whitespace-separated, scorable=True makes it scorable.
        label=whoosh.fields.KEYWORD(stored=True, commas=True,
                                    scorable=True),
        # date / last_read: DATETIME fields.
        date=whoosh.fields.DATETIME(stored=True),
        last_read=whoosh.fields.DATETIME(stored=True),
    )
github openpaperwork / paperwork / paperwork-backend / paperwork_backend / docimport.py View on Github external
doc = None
        docs = []
        pages = []

        file_uris = [self.fs.safe(uri) for uri in file_uris]
        imported = []
        for file_uri in file_uris:
            logger.info("Importing PDF from '%s'" % (file_uri))
            idx = 0

            for child in self.fs.recurse(file_uri):
                gc.collect()
                if not self.check_file_type(child):
                    continue
                h = PdfDoc.hash_file(self.fs, child)
                if docsearch.is_hash_in_index(h):
                    logger.info(
                        "Document %s already found in the index. Skipped",
                        child
                    )
                    continue
                imported.append(child)
                doc = PdfDoc(self.fs, docsearch.rootdir)
                error = doc.import_pdf(child)
                if error:
                    continue
                docs.append(doc)
                pages += [p for p in doc.pages]
                idx += 1
        return ImportResult(
            imported_file_uris=imported,