How to use textract - 10 common examples

To help you get started, we’ve selected a few textract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github danthelion / doc2audiobook / doc2audiobook.py View on Github external
def process_input_file(client, voice, audio_config, input_file: Path, output_directory_path):
    """Extract the text of ``input_file`` and hand it to ``text_to_mp3``.

    The target file lives in ``output_directory_path`` and reuses the input
    file's stem with an ``.mp3`` suffix. ``client``, ``voice`` and
    ``audio_config`` are forwarded to ``text_to_mp3`` untouched.
    """
    logger.info(f'Processing input file `{input_file}`')

    mp3_name = input_file.stem + '.mp3'
    output_file = output_directory_path / mp3_name
    logger.info(f'Target output file is: `{output_file}`')

    # textract works on path strings, so the Path is converted first.
    synthesis_kwargs = {
        'client': client,
        'voice': voice,
        'audio_config': audio_config,
        'text': textract.process(str(input_file)),
        'output_file_path': output_file,
    }
    text_to_mp3(**synthesis_kwargs)

    logger.info(f'Processing done for input file `{input_file}`')
github adamheins / ligatures / examples / parse_pdf.py View on Github external
def main():
    """Extract text from ``sample.pdf`` and restore its missing ligatures."""
    # Placeholder glyph (unicode "unknown" character) marking a lost ligature.
    unknown_lig = u'\ufffd'

    # Pull the text out of the PDF and normalise it to a stripped str.
    raw_bytes = textract.process('sample.pdf')
    text = raw_bytes.decode('utf-8').strip()

    # The ligature map is built from the word list once and cached in 'data'.
    if not os.path.isdir('data'):
        with open('words.txt') as word_file:
            words = set(word_file.read().splitlines())
        ligatures.build(words).save('data')

    # Always work from the on-disk copy of the map.
    lig_map = ligatures.load('data')

    # Query the map to fill the placeholder glyphs back in.
    _, new_text = lig_map.query_text(text, unknown_lig)
github texta-tk / texta / dataset_importer / document_reader / readers / entity / pdf_reader.py View on Github external
def get_features(**kwargs):
        """Yield one feature dict per PDF file found in ``kwargs['directory']``.

        Each dict starts from ``PDFReader.get_meta_features`` and gains two keys:
        ``'text'`` (the output of ``textract.process``) and ``'_texta_id'`` (the
        file path). Files whose text extraction fails are skipped.

        Raises:
            KeyError: if ``'directory'`` is missing from ``kwargs``.
        """
        directory = kwargs['directory']

        for file_path in PDFReader.get_file_list(directory, 'pdf'):
            features = PDFReader.get_meta_features(file_path=file_path)

            try:
                features['text'] = textract.process(file_path)
                features['_texta_id'] = file_path
            except Exception:
                # Best-effort import: an unreadable file is skipped, not fatal.
                continue

            # The yield stays outside the try block on purpose. A handler
            # wrapped around it would swallow the GeneratorExit raised when the
            # consumer closes the generator early ("generator ignored
            # GeneratorExit") as well as any exception thrown in by the caller.
            yield features
github wellcometrust / reach / tools / extraction.py View on Github external
def convert(filename):
    """Return the text of ``filename`` decoded as UTF-8.

    The default textract method is tried first; when it yields nothing the
    extraction is repeated with the ``tesseract`` method.
    """
    extracted = (
        textract.process(filename, encoding='utf-8')
        or textract.process(filename, encoding='utf-8', method='tesseract')
    )
    return extracted.decode('utf-8')
github mitmedialab / DataBasic / databasic / logic / filehandler.py View on Github external
def _docx_to_txt(file_path):
    """Run textract on ``file_path`` and return its output decoded as UTF-8."""
    raw_bytes = textract.process(file_path)
    return raw_bytes.decode('utf-8')
github fourdigits / wagtail_textract / src / wagtail_textract / handlers.py View on Github external
def transcribe_document(document):
    """Extract the Document file's text and save it as its transcription.

    Extraction falls back to the ``tesseract`` method when the default one
    finds no text. Any extraction error is logged and leaves the document
    untouched.
    """
    try:
        text = textract.process(document.file.path).strip()
        if not text:
            logger.debug('No text found, falling back to tesseract.')
            text = textract.process(document.file.path, method='tesseract').strip()
    except Exception as err:
        text = None
        error_message = 'Text extraction error with file {file}: {message}'.format(
            file=document.filename,
            message=str(err),
        )
        logger.error(error_message)

    # Nothing extracted (or extraction failed): leave the document as it is.
    if not text:
        return

    document.transcription = text.decode()
    document.save(transcribe=False)
github adamkhazi / information-extraction-system / extractor.py View on Github external
def read_resume_content_txtract(self):
        """Extract the text of every dataset file with textract.

        Fills ``self.resume_content`` with one decoded string per file that was
        extracted successfully. Entries of ``self.dataset_filenames`` whose
        extraction failed are removed in place, so the remaining filenames line
        up with ``self.resume_content``.
        """
        self.logger.println("extracting resume content using textract")
        self.resume_content = []
        # idxs of files that don't have content
        remove_files_idxs = []
        total_files = len(self.dataset_filenames)
        for idx, filename in enumerate(self.dataset_filenames):
            self.logger.println("extracting from resume %s/%s using txtract" % (idx+1, total_files))
            # append filename + ext to path
            filepath = self.__dataset_raw_data_folder + self.__file_path_seperator + filename[0] + filename[1]
            try:
                extracted_bytes = textract.process(filepath, encoding="utf_8")
                extracted_str = extracted_bytes.decode("utf-8")
            except Exception:
                # Best-effort: a file that cannot be read is dropped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                self.logger.println("txtract threw an error")
                remove_files_idxs.append(idx)
            else:
                self.resume_content.append(extracted_str)

        # Log while the indices still refer to the unmodified list.
        for idx in remove_files_idxs:
            self.logger.println("removing unprocessed resume file at index %s named %s" % (idx, self.dataset_filenames[idx]))
        # Delete from the highest index down so earlier deletions cannot shift
        # the positions of the entries that are still to be removed.
        for idx in reversed(remove_files_idxs):
            del self.dataset_filenames[idx]

        self.logger.println("read content from %s resume files" % len(self.resume_content))
github datamade / django-councilmatic / councilmatic_core / management / commands / convert_attachment_text.py View on Github external
document_id = document_data['id']
            response = requests.get(url)
            # Sometimes, Metro Legistar has a URL that returns a bad status code (e.g., 404 from http://metro.legistar1.com/metro/attachments/95d5007e-720b-4cdd-9494-c800392b9265.pdf).
            # Skip these documents.
            if response.status_code != 200:
                logger.error('Document URL {} returns {} - Could not get attachment text!'.format(url, response.status_code))
                continue

            extension = os.path.splitext(url)[1]

            with tempfile.NamedTemporaryFile(suffix=extension) as tfp:
                tfp.write(response.content)

                try:
                    plain_text = textract.process(tfp.name)
                except textract.exceptions.ShellError as e:
                    logger.error('{} - Could not convert Document ID {}!'.format(e, document_id))
                    continue

                logger.info('Document ID {} - conversion complete'.format(document_id))

            yield {'plain_text': plain_text.decode('utf-8'), 'id': document_id}
github BurgosNY / CamaraCrawler / pdf_parser.py View on Github external
def convert(self):
        """Convert the previously downloaded PDF to a string.

        Calls ``self.download()``, extracts the text of ``self.filename`` with
        textract (http://textract.readthedocs.io/en/stable/) using the
        ``pdftotext`` method, runs the result through BeautifulSoup's HTML
        parser and returns the prettified text.

        Unless ``self.save`` is truthy, the file at ``self.filename`` is removed
        after a successful conversion.

        Raises:
            NameError: if textract fails with a ``ShellError``; the original
                error is attached as the cause.
        """

        import textract
        # Only the side effect of download() is needed here; the return value
        # was never used. (Presumably it stores the file at self.filename;
        # confirm against download().)
        self.download()
        try:
            source_binary = textract.process(self.filename, encoding='utf_8',
                                             method='pdftotext', layout=True)
            soup = BeautifulSoup(source_binary, "html.parser")
            text_string = soup.prettify(formatter=None)
        except textract.exceptions.ShellError as err:
            # TODO: implement a way to handle non-PDF files.
            print('Not a pdf')
            raise NameError('The file is not a .pdf') from err

        # Delete the downloaded file unless saving was explicitly requested.
        if not self.save:
            os.remove(self.filename)
        return text_string
github BitCurator / bitcurator-access-webtools / bcaw / text_indexer.py View on Github external
    @classmethod
    def get_path_details(cls, temp_path, image_path):
        """Return the byte sequence and the full text for a given path.

        Args:
            temp_path: path of the file to read and extract text from.
            image_path: only used in debug log messages when extraction fails.

        Returns:
            A ``(byte_sequence, full_text)`` tuple. ``full_text`` is ``""`` when
            the MIME type maps to no extension, ``"N/A"`` when text extraction
            fails, and the result of ``process`` otherwise.
        """
        byte_sequence = ByteSequence.from_path(temp_path)
        extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
        logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                      extension, byte_sequence.sha1)
        if extension is None:
            # No known extension for this MIME type: nothing to extract.
            return byte_sequence, ""

        try:
            logging.debug("Textract for SHA1 %s, extension map val %s",
                          byte_sequence.sha1, extension)
            return byte_sequence, process(temp_path, extension=extension, encoding='ascii',
                                          preserveLineBreaks=True)
        except ExtensionNotSupported:
            logging.exception("Textract extension not supported for ext %s", extension)
        except LookupError:
            logging.exception("Lookup error for encoding.")
        except UnicodeDecodeError:
            logging.exception("UnicodeDecodeError, problem with file encoding")
        except Exception:
            # Catch-all for any other extraction failure. Unlike a bare
            # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
            logging.exception("Textract UNEXPECTEDLY failed for temp_file.")

        # Shared fallback for every failure path above.
        logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
        return byte_sequence, "N/A"

textract

extract text from any document. no muss. no fuss.

MIT
Latest version published 2 years ago

Package Health Score

59 / 100
Full package analysis