How to use the pdf2image.pdf2image._page_count function in pdf2image

To help you get started, we’ve selected a few pdf2image examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: GitHub — srijan14 / Document-Machine-Translation / src / convert.py (View on GitHub, external link)
def extract_text(self):

        PDF_file = self.filename
        out_folder_name = os.path.basename(self.filename)

        if not os.path.exists(self.image_out_path):
            os.mkdir(self.image_out_path)

        if not os.path.exists(os.path.abspath(os.path.join(self.image_out_path,\
                out_folder_name))):
            os.mkdir(os.path.abspath(os.path.join(self.image_out_path,\
                out_folder_name)))


        index=0
        maxPages = pdf2image._page_count(PDF_file)
        for page in range(0, maxPages, 10):
            pages = pdf2image.convert_from_path(PDF_file, dpi=200,
                                              first_page=page,
                              last_page=min(page + 10 - 1, maxPages))
            for tpage in pages:
                tpage.save(os.path.abspath(os.path.join(self.image_out_path,
                                                       out_folder_name ,
                                                       str(index) + ".jpg"
                                                       )),'JPEG')
                index = index + 1

        print("Successfully saved images for each page for {}".format(self.image_out_path))

        english_text = list()

        for filename in sorted(os.listdir(os.path.join(self.image_out_path,