How to use the `ftfy.chardata.possible_encoding` function in ftfy

To help you get started, we’ve selected a few ftfy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from github.com/LuminosoInsight/python-ftfy — ftfy/streamtester/__init__.py (view on GitHub)
def check_ftfy(self, text, encoding_only=True):
    """
    Check whether ftfy would change `text`, and if so, print the
    before/after pair.

    When `encoding_only` is true, only `fix_encoding` is applied;
    otherwise the full `fix_text` pipeline runs, with curly quotes
    and character width left untouched. Every 100 inputs, a progress
    dot is printed.
    """
    self.count += 1
    text = unescape_html(text)
    # Pure-ASCII text cannot be mojibake, so skip it entirely.
    if not possible_encoding(text, 'ascii'):
        if encoding_only:
            fixed = fix_encoding(text)
        else:
            fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
        if fixed != text:
            # possibly filter common bots before printing
            print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                text=text, fixed=fixed
            ))
            self.num_fixed += 1
        elif 'â€' in text or '\x80' in text:
            # Looks like mojibake that ftfy left unchanged; report it.
            print('\nNot fixed:\t{text!r}'.format(text=text))

    # Emit a progress dot once in a while
    if self.count % 100 == 0:
        print('.', end='', flush=True)
Example from github.com/LuminosoInsight/python-ftfy — ftfy/fixes.py (view on GitHub)
def fix_text_and_explain(text):
    """
    Performs a single step of re-encoding text that's been decoded incorrectly.
    It returns the decoded text, plus a structure explaining what it did.

    This structure could be used for more than it currently is, but we at least
    use it to track whether we had to intepret text as an old encoding such as
    MacRoman or cp437.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in FIXABLE_CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            print('possible encoding: %s' % encoding)
            # This is an ugly-looking way to get the bytes that represent
            # the text in this encoding. The reason we can't necessarily
            # use .encode(encoding) is that the decoder is very likely
Example from github.com/LuminosoInsight/python-ftfy — ftfy/autodecode/build_classifier.py (view on GitHub)
def learn_matrix(datafile):
    """
    Learn a trigram-count matrix from a UTF-8 text file.

    For each line that is not pure ASCII, re-encode it in every encoding
    in ENCODINGS, and for every byte trigram centered on a non-ASCII byte
    (>= 0x80), increment the cell at (trigram row, encoding index).

    Returns the matrix with each row divided by its sum of squares.
    NOTE(review): that divisor is the *squared* Euclidean norm, not the
    norm itself — confirm this is the intended normalization.
    """
    # Start from ones (add-one smoothing), which also guarantees every
    # row norm below is nonzero.
    matrix = np.ones((1 << 23, N), np.float32, order='F')
    # Use a context manager so the file handle is closed deterministically
    # instead of leaking until garbage collection.
    with codecs.open(datafile, encoding='utf-8') as stream:
        for count, line in enumerate(stream, 1):
            if count % 1000 == 0:
                print(count)
            # ASCII-only lines carry no evidence about non-ASCII encodings.
            if possible_encoding(line, 'ascii'):
                continue

            for i, encoding in enumerate(ENCODINGS):
                # Keep the try body minimal: only .encode() can raise here.
                try:
                    linebytes = line.encode(encoding)
                except UnicodeEncodeError:
                    # The line isn't representable in this encoding; skip it.
                    continue
                # Skip the first and last byte so pos-1..pos+1 stays in range.
                for pos in range(1, len(linebytes) - 1):
                    if linebytes[pos] >= 0x80:
                        trigram = linebytes[pos - 1:pos + 2]
                        assert len(trigram) == 3
                        row = trigram_to_row(trigram)
                        matrix[row, i] += 1

    norms = np.sum(matrix * matrix, axis=1)[:, np.newaxis]
    return matrix / norms
Example from github.com/LuminosoInsight/python-ftfy — ftfy/fixes.py (view on GitHub)
return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in FIXABLE_CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            print('possible encoding: %s' % encoding)
            # This is an ugly-looking way to get the bytes that represent
            # the text in this encoding. The reason we can't necessarily
            # use .encode(encoding) is that the decoder is very likely
            # to have been sloppier than Python.
            #
            # The decoder might have left bytes unchanged when they're not
            # part of the encoding. It might represent b'\x81' as u'\x81'
            # in Windows-1252, while Python would claim that using byte
            # 0x81 in Windows-1252 is an error.
            #
            # So what we do here is we use the .translate method of Unicode
            # strings. Using it with the character maps we have computed will
            # give us back a Unicode string using only code
            # points up to 0xff. This can then be converted into the intended
            # bytes by encoding it as Latin-1.