How to use the normality.guess_encoding function in normality

To help you get started, we’ve selected a few normality examples that show popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pudo / normality / tests / test_normality.py View on Github external
def test_guess_encoding(self):
        """Check that ISO-8859-5 encoded Cyrillic bytes are identified as such."""
        expected = 'iso-8859-5'
        # Encode a Cyrillic sample with the codec we expect to be guessed.
        raw = u'Порошенко Петро Олексійович'.encode(expected)
        self.assertEqual(expected, guess_encoding(raw))
github alephdata / aleph / services / ingest-file / ingestors / support / encoding.py View on Github external
def read_file_decoded(self, entity, file_path):
        """Read ``file_path`` as bytes and decode it to text.

        If ``entity`` has no ``encoding`` value yet, one is guessed from the
        file's bytes and stored on the entity. Every encoding listed on the
        entity is then tried in order, and the first successful decode is
        returned.

        Args:
            entity: object exposing ``has``/``set``/``get`` for the
                ``encoding`` property; ``get`` is iterated for candidates.
            file_path: path of the file to read.

        Returns:
            The decoded text, or ``None`` if the entity yields no encodings.

        Raises:
            ProcessingException: if no candidate encoding can decode the
                file; chained from the first ``UnicodeDecodeError`` seen.
        """
        with open(file_path, 'rb') as fh:
            body = fh.read()

        if not entity.has('encoding'):
            entity.set('encoding', guess_encoding(body))

        failed_encoding, failure = None, None
        for encoding in entity.get('encoding'):
            try:
                text = body.decode(encoding)
            except UnicodeDecodeError as ude:
                # Keep the first failure for the error report, but still give
                # the remaining candidate encodings a chance.
                if failure is None:
                    failed_encoding, failure = encoding, ude
                continue
            if encoding != self.DEFAULT_ENCODING:
                log.info("Decoding [%r] as: %s", entity, encoding)
            return text

        if failure is not None:
            raise ProcessingException('Error decoding file as %s: %s' %
                                      (failed_encoding, failure)) from failure
        return None
github occrp-attic / ingestors / ingestors / email / outlookmsg_lib.py View on Github external
a value if possible.  If there are both ASCII and Unicode
        versions, then the parameter /prefer/ specifies which will be
        returned.
        """

        if isinstance(filename, list):
            # Join with slashes to make it easier to append the type
            filename = "/".join(filename)

        value = windowsUnicode(self._getStream(filename + '001F'))
        if value is None:
            raw = self._getStream(filename + '001E')
            try:
                value = decode_utf7(raw)
            except Exception:
                encoding = guess_encoding(raw)
                value = raw.decode(encoding, 'replace')

        if value is not None and len(value):
            return remove_unsafe_chars(value)
github alephdata / aleph / services / ingest-file / ingestors / email / outlookmsg_lib.py View on Github external
a value if possible.  If there are both ASCII and Unicode
        versions, then the parameter /prefer/ specifies which will be
        returned.
        """

        if isinstance(filename, list):
            # Join with slashes to make it easier to append the type
            filename = "/".join(filename)

        value = windowsUnicode(self._getStream(filename + '001F'))
        if value is None:
            raw = self._getStream(filename + '001E')
            try:
                value = decode_utf7(raw)
            except Exception:
                encoding = guess_encoding(raw)
                value = raw.decode(encoding, 'replace')

        if value is not None and len(value):
            return remove_unsafe_chars(value)
github pudo / pgcsv / pgcsv / csv.py View on Github external
def open_csv(file_path, encoding=None, delimiter=None):
    """Open a CSV file as text and report its delimiter and header row.

    Args:
        file_path: path of the CSV file.
        encoding: text encoding to use; when ``None`` it is guessed from the
            first ``SAMPLE_SIZE`` bytes of the file.
        delimiter: field delimiter; when ``None`` it is sniffed from the
            first ``SAMPLE_SIZE`` characters of the file.

    Returns:
        A ``(fh, delimiter, headers)`` tuple. ``fh`` is the open text file
        rewound to the start (the caller is responsible for closing it) and
        ``headers`` is the first row, or ``[]`` for a file with no rows.

    Raises:
        csv.Error: if the delimiter cannot be sniffed. The file handle is
            closed before any error propagates.
    """
    if encoding is None:
        with io.open(file_path, 'rb') as sample_fh:
            encoding = guess_encoding(sample_fh.read(SAMPLE_SIZE))

    fh = io.open(file_path, 'r', encoding=encoding)
    try:
        if delimiter is None:
            dialect = csv.Sniffer().sniff(fh.read(SAMPLE_SIZE))
            delimiter = dialect.delimiter
            fh.seek(0)

        # Only the first row is needed; an empty file yields no headers.
        headers = next(csv.reader(fh, delimiter=delimiter), [])
        fh.seek(0)
    except BaseException:
        # The handle is only handed to the caller on success, so it must not
        # be left open when sniffing or header parsing fails.
        fh.close()
        raise
    return fh, delimiter, headers
github alephdata / aleph / services / ingest-file / ingestors / support / encoding.py View on Github external
def decode_string(self, text, encoding=DEFAULT_ENCODING):
        """Decode ``text`` to a string, falling back through several encodings.

        Values that are not ``bytes`` are handed to ``stringify`` unchanged.
        Bytes are decoded strictly with the normalized ``encoding`` first,
        then strictly with a guessed encoding, and finally with the
        normalized ``encoding`` again using ``'replace'`` error handling.
        """
        if isinstance(text, bytes):
            encoding = normalize_encoding(encoding)

            # First choice: the encoding the caller asked for.
            try:
                return text.decode(encoding, 'strict')
            except Exception:
                pass  # deliberate: fall through to the guessed encoding

            # Second choice: whatever the detector suggests; last resort is
            # a lossy decode with the requested encoding.
            try:
                return text.decode(guess_encoding(text), 'strict')
            except Exception:
                return text.decode(encoding, 'replace')

        return stringify(text)