How to use textract - 10 common examples

To help you get started, we’ve selected a few textract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github danthelion / doc2audiobook / View on Github external
def process_input_file(client, voice, audio_config, input_file: Path, output_directory_path):'Processing input file `{input_file}`')
    output_file = output_directory_path / (input_file.stem + '.mp3')'Target output file is: `{output_file}`')

    text_to_translate = textract.process(str(input_file))

    )'Processing done for input file `{input_file}`')
github adamheins / ligatures / examples / View on Github external
def main():
    # Extract the text from the PDF file.
    text = textract.process('sample.pdf').decode('utf-8').strip()

    # Symbol representing a missing ligature (unicode "unknown" glyph)
    unknown_lig = u'\ufffd'

    # Build the ligature map if it doesn't already exist.
    if not os.path.isdir('data'):
        with open('words.txt') as f:
            words = set(
        lig_map ='data')

    # Load the ligature map from the data directory.
    lig_map = ligatures.load('data')

    # Restore the missing ligatures.
    _, new_text = lig_map.query_text(text, unknown_lig)
github texta-tk / texta / dataset_importer / document_reader / readers / entity / View on Github external
def get_features(**kwargs):
        directory = kwargs['directory']

        for file_path in PDFReader.get_file_list(directory, 'pdf'):
            features = PDFReader.get_meta_features(file_path=file_path)

                features['text'] = textract.process(file_path)
                features['_texta_id'] = file_path

                yield features
github wellcometrust / reach / tools / View on Github external
def convert(filename):
    """Return the text textract extracts from `filename`, decoded as UTF-8.

    The file is first processed with textract's default method for its type.
    If that yields an empty result, it is processed a second time with
    ``method='tesseract'`` and that output is used instead.
    """
    # `or` only evaluates the tesseract call when the first result is falsy
    # (i.e. empty), so the fallback runs under the same condition as before.
    extracted = textract.process(filename, encoding='utf-8') or textract.process(
        filename, encoding='utf-8', method='tesseract'
    )
    return extracted.decode('utf-8')
github mitmedialab / DataBasic / databasic / logic / View on Github external
def _docx_to_txt(file_path):
    """Run textract on `file_path` and return its output decoded as UTF-8."""
    raw_bytes = textract.process(file_path)
    return raw_bytes.decode('utf-8')
github fourdigits / wagtail_textract / src / wagtail_textract / View on Github external
def transcribe_document(document):
    """Store the Document file's text in the transcription field."""
        text = textract.process(document.file.path).strip()
        if not text:
            logger.debug('No text found, falling back to tesseract.')
            text = textract.process(

    except Exception as err:
        text = None
            'Text extraction error with file {file}: {message}'.format(

    if text:
        document.transcription = text.decode()
github adamkhazi / information-extraction-system / View on Github external
def read_resume_content_txtract(self):
        self.logger.println("extracting resume content using textract")
        self.resume_content = []
        # idxs of files that don't have content
        remove_files_idxs = []
        for idx, filename in enumerate(self.dataset_filenames):
            self.logger.println("extracting from resume %s/%s using txtract" % (idx+1, len(self.dataset_filenames)) )
            # append filename + ext to path
            filepath = self.__dataset_raw_data_folder + self.__file_path_seperator + filename[0] + filename[1]
            extracted_str = ""
                extracted_bytes = textract.process(filepath, encoding="utf_8")
                extracted_str = extracted_bytes.decode("utf-8")
                self.logger.println("txtract threw an error")
        deleted_idxs = 0
        for idx in remove_files_idxs:
            self.logger.println("removing unprocessed resume file at index %s named %s" % (idx, self.dataset_filenames[idx]))
            del self.dataset_filenames[idx-deleted_idxs]

        self.logger.println("read content from %s resume files" % len(self.resume_content))
github datamade / django-councilmatic / councilmatic_core / management / commands / View on Github external
document_id = document_data['id']
            response = requests.get(url)
            # Sometimes, Metro Legistar has a URL that returns a bad status code (e.g., 404).
            # Skip these documents.
            if response.status_code != 200:
                logger.error('Document URL {} returns {} - Could not get attachment text!'.format(url, response.status_code))

            extension = os.path.splitext(url)[1]

            with tempfile.NamedTemporaryFile(suffix=extension) as tfp:

                    plain_text = textract.process(
                except textract.exceptions.ShellError as e:
                    logger.error('{} - Could not convert Document ID {}!'.format(e, document_id))

      'Document ID {} - conversion complete'.format(document_id))

            yield {'plain_text': plain_text.decode('utf-8'), 'id': document_id}
github BurgosNY / CamaraCrawler / View on Github external
def convert(self):
        """Converte o PDF baixado anteriormente.

        Faz download de um arquivo PDF, executa o 'parser' do BeautifulSoup
        e transforma o mesmo em uma 'string' utilizando o textract:

        import textract
        source_file =
            source_binary = textract.process(self.filename, encoding='utf_8',
                                             method='pdftotext', layout=True)
            soup = BeautifulSoup(source_binary, "html.parser")
            text_string = soup.prettify(formatter=None)
        except textract.exceptions.ShellError:
            # TODO: implementar uma maneira de lidar com arquivos nao PDF.
            print('Not a pdf')
            raise NameError('The file is not a .pdf')

        # Apaga o arquivo baixado caso não esteja explícito para salvar
        if not
        return text_string
github BitCurator / bitcurator-access-webtools / bcaw / View on Github external
    def get_path_details(cls, temp_path, image_path):
        """Return the byte sequence and the full text for a given path."""
        byte_sequence = ByteSequence.from_path(temp_path)
        extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
        logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                      extension, byte_sequence.sha1)
        full_text = ""
        if extension is not None:
                logging.debug("Textract for SHA1 %s, extension map val %s",
                              byte_sequence.sha1, extension)
                full_text = process(temp_path, extension=extension, encoding='ascii',
            except ExtensionNotSupported as _:
                logging.exception("Textract extension not supported for ext %s", extension)
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except LookupError as _:
                logging.exception("Lookup error for encoding.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except UnicodeDecodeError as _:
                logging.exception("UnicodeDecodeError, problem with file encoding")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
                logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
        return byte_sequence, full_text


extract text from any document. no muss. no fuss.

Latest version published 1 year ago

Package Health Score

65 / 100
Full package analysis