How to use the textdistance.algorithms.compression_based._NCDBase class in textdistance

To help you get started, we’ve selected a few textdistance examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
probs = self._make_probs(data)
        start, end = self._get_range(data=data, probs=probs)
        output_fraction = Fraction(0, 1)
        output_denominator = 1
        while not (start <= output_fraction < end):
            output_numerator = 1 + ((start.numerator * output_denominator) // start.denominator)
            output_fraction = Fraction(output_numerator, output_denominator)
            output_denominator *= 2
        return output_fraction

    def _get_size(self, data):
        numerator = self._compress(data).numerator
        return math.ceil(math.log(numerator, self.base))


class RLENCD(_NCDBase):
    """Run-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    """

    def _compress(self, data):
        """Return ``data`` with its long runs of equal items collapsed.

        A run of more than two equal items becomes the run length followed by
        the item; runs of one or two items are emitted unchanged.
        """
        def encode(item, run_length):
            # A count prefix is only used for runs longer than two items.
            if run_length > 2:
                return str(run_length) + item
            return run_length * item

        return ''.join(
            encode(item, len(list(run))) for item, run in groupby(data)
        )
github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
class _BinaryNCDBase(_NCDBase):
    """Base class that UTF-8 encodes text input before delegating to the parent."""

    def __init__(self):
        # Deliberately sets no instance attributes.
        pass

    def __call__(self, *sequences):
        """Compare ``sequences`` via the parent class's ``__call__``.

        Returns 0 when called without arguments. When the first argument is a
        text string, every argument is encoded as UTF-8 first.
        """
        if sequences:
            if isinstance(sequences[0], string_types):
                sequences = [item.encode('utf-8') for item in sequences]
            return super(_BinaryNCDBase, self).__call__(*sequences)
        return 0


class ArithNCD(_NCDBase):
    """Arithmetic coding

    https://github.com/gw-c/arith
    http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251
    https://en.wikipedia.org/wiki/Arithmetic_coding
    """

    def __init__(self, base=2, terminator=None, qval=1):
        """Store the given settings on the instance.

        NOTE(review): how ``base``, ``terminator`` and ``qval`` are consumed is
        not visible in this excerpt -- confirm against the rest of the class.
        """
        self.base = base
        self.terminator = terminator
        self.qval = qval

    def _make_probs(self, *sequences):
        """
        https://github.com/gw-c/arith/blob/master/arith.py
        """
github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
"""Square Root based NCD

    The size of the compressed data equals the sum of the square roots of the
    counts of every element in the input sequence.
    """
    def __init__(self, qval=1):
        """Store ``qval`` on the instance.

        NOTE(review): ``qval`` is not read by the methods visible here;
        presumably the base class uses it -- confirm.
        """
        self.qval = qval

    def _compress(self, data):
        return {element: math.sqrt(count) for element, count in Counter(data).items()}

    def _get_size(self, data):
        return sum(self._compress(data).values())


class EntropyNCD(_NCDBase):
    """Entropy based NCD

    Use the entropy of the input sequence as the size of the compressed data.

    https://en.wikipedia.org/wiki/Entropy_(information_theory)
    https://en.wikipedia.org/wiki/Entropy_encoding
    """
    def __init__(self, qval=1, coef=1, base=2):
        """Store the given settings on the instance.

        NOTE(review): how ``qval``, ``coef`` and ``base`` are consumed is not
        visible in this excerpt -- confirm against the rest of the class.
        """
        self.qval = qval
        self.coef = coef
        self.base = base

    def _compress(self, data):
        total_count = len(data)
        entropy = 0.0
        for element_count in Counter(data).values():
github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
self.terminator = terminator

    def _compress(self, data):
        """Transform ``data`` and pass it to the parent class's ``_compress``.

        * Empty input is replaced by the terminator alone.
        * Input that lacks the terminator gets it appended; all rotations of
          the result are then sorted, and the last item of each rotation, in
          sorted order, forms the transformed data.
        * Input that already contains the terminator is passed on unchanged.
        """
        marker = self.terminator
        if not data:
            data = marker
        elif marker not in data:
            data += marker
            size = len(data)
            rotations = sorted(data[cut:] + data[:cut] for cut in range(size))
            data = ''.join(rotation[-1] for rotation in rotations)
        return super(BWTRLENCD, self)._compress(data)


# -- NORMAL COMPRESSORS -- #


class SqrtNCD(_NCDBase):
    """Square Root based NCD

    The size of the compressed data is the sum of the square roots of the
    counts of every element in the input sequence.
    """
    def __init__(self, qval=1):
        self.qval = qval

    def _compress(self, data):
        """Map each distinct element of ``data`` to the square root of its count."""
        roots = {}
        for item, occurrences in Counter(data).items():
            roots[item] = math.sqrt(occurrences)
        return roots

    def _get_size(self, data):
        """Return the sum of the values produced by ``_compress(data)``."""
        roots = self._compress(data)
        return sum(roots.values())


class EntropyNCD(_NCDBase):
github life4 / textdistance / textdistance / algorithms / compression_based.py View on Github external
sequences = self._get_sequences(*sequences)

        concat_length = float('Inf')
        empty = type(sequences[0])()
        for data in permutations(sequences):
            if isinstance(empty, (str, bytes)):
                data = empty.join(data)
            else:
                data = sum(data, empty)
            concat_length = min(concat_length, self._get_size(data))

        compressed_lengths = [self._get_size(s) for s in sequences]
        return float(concat_length - min(compressed_lengths)) / max(compressed_lengths)


class _BinaryNCDBase(_NCDBase):
    """Base class that UTF-8 encodes text input before delegating to the parent."""

    def __init__(self):
        # Deliberately sets no instance attributes.
        pass

    def __call__(self, *sequences):
        """Compare ``sequences`` via the parent class's ``__call__``.

        Returns 0 when called without arguments. When the first argument is a
        text string, every argument is encoded as UTF-8 first.
        """
        if sequences:
            if isinstance(sequences[0], string_types):
                sequences = [item.encode('utf-8') for item in sequences]
            return super(_BinaryNCDBase, self).__call__(*sequences)
        return 0


class ArithNCD(_NCDBase):
    """Arithmetic coding

    https://github.com/gw-c/arith