Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
probs = self._make_probs(data)
start, end = self._get_range(data=data, probs=probs)
output_fraction = Fraction(0, 1)
output_denominator = 1
while not (start <= output_fraction < end):
output_numerator = 1 + ((start.numerator * output_denominator) // start.denominator)
output_fraction = Fraction(output_numerator, output_denominator)
output_denominator *= 2
return output_fraction
def _get_size(self, data):
    """Return the size of the compressed representation of ``data``.

    The size is the ceiling of the logarithm (in base ``self.base``) of the
    numerator of the fraction returned by ``self._compress(data)``.

    A zero numerator yields 0: ``math.log(0, base)`` raises ``ValueError``,
    so that case is handled before the logarithm is taken.
    """
    numerator = self._compress(data).numerator
    if numerator == 0:
        # log(0) is undefined; treat it as needing no digits.
        return 0
    return math.ceil(math.log(numerator, self.base))
class RLENCD(_NCDBase):
    """Run-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    """

    def _compress(self, data):
        """Return ``data`` with runs of equal elements collapsed.

        A single element is kept as is, a pair is written out twice, and a
        run of three or more becomes the run length followed by the element.
        """
        chunks = []
        for symbol, run in groupby(data):
            length = sum(1 for _ in run)
            if length == 1:
                chunks.append(symbol)
            elif length == 2:
                # A count prefix would not be shorter than the pair itself.
                chunks.append(2 * symbol)
            else:
                chunks.append(str(length) + symbol)
        return ''.join(chunks)
class _BinaryNCDBase(_NCDBase):
    """NCD base that passes bytes to the parent implementation."""

    def __init__(self):
        pass

    def __call__(self, *sequences):
        """Encode text inputs to UTF-8 and delegate to the parent ``__call__``.

        Returns 0 when no sequences are given. Only the first sequence is
        inspected to decide whether all of them get encoded.
        """
        if sequences:
            if isinstance(sequences[0], string_types):
                sequences = [item.encode('utf-8') for item in sequences]
            return super(_BinaryNCDBase, self).__call__(*sequences)
        return 0
class ArithNCD(_NCDBase):
"""Arithmetic coding
https://github.com/gw-c/arith
http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251
https://en.wikipedia.org/wiki/Arithmetic_coding
"""
def __init__(self, base=2, terminator=None, qval=1):
    """Store the coder settings on the instance.

    ``base``, ``terminator`` and ``qval`` are saved unchanged as attributes
    of the same name.
    """
    self.qval = qval
    self.terminator = terminator
    self.base = base
def _make_probs(self, *sequences):
"""
https://github.com/gw-c/arith/blob/master/arith.py
"""
"""Square Root based NCD
The size of the compressed data equals the sum of the square roots of the counts of every
element in the input sequence.
"""
def __init__(self, qval=1):
    """Save ``qval`` unchanged as an attribute of the instance."""
    self.qval = qval
def _compress(self, data):
    """Map every distinct element of ``data`` to the square root of its count."""
    roots = {}
    for item, occurrences in Counter(data).items():
        roots[item] = math.sqrt(occurrences)
    return roots
def _get_size(self, data):
    """Return the sum of the values in the mapping from ``self._compress(data)``."""
    weights = self._compress(data)
    return sum(weights.values())
class EntropyNCD(_NCDBase):
"""Entropy based NCD
Get the entropy of the input sequence as the size of the compressed data.
https://en.wikipedia.org/wiki/Entropy_(information_theory)
https://en.wikipedia.org/wiki/Entropy_encoding
"""
def __init__(self, qval=1, coef=1, base=2):
    """Store the settings on the instance.

    ``qval``, ``coef`` and ``base`` are saved unchanged as attributes of the
    same name.
    """
    self.base = base
    self.coef = coef
    self.qval = qval
def _compress(self, data):
total_count = len(data)
entropy = 0.0
for element_count in Counter(data).values():
self.terminator = terminator
def _compress(self, data):
    """Apply a Burrows-Wheeler style transform, then the parent compression.

    Empty input is replaced by ``self.terminator``; otherwise the terminator
    is appended when it is not already present. All rotations of the result
    are sorted, and the last element of each rotation forms the string
    handed to the parent class's ``_compress``.
    """
    if not data:
        data = self.terminator
    elif self.terminator not in data:
        data += self.terminator
    rotations = [data[shift:] + data[:shift] for shift in range(len(data))]
    rotations.sort()
    last_column = ''.join(rotation[-1] for rotation in rotations)
    return super(BWTRLENCD, self)._compress(last_column)
# -- NORMAL COMPRESSORS -- #
class SqrtNCD(_NCDBase):
    """Square-root based NCD

    The size of the compressed data equals the sum of the square roots of
    the counts of every element in the input sequence.
    """

    def __init__(self, qval=1):
        """Save ``qval`` unchanged as an attribute of the instance."""
        self.qval = qval

    def _compress(self, data):
        """Map every distinct element of ``data`` to the square root of its count."""
        roots = {}
        for item, occurrences in Counter(data).items():
            roots[item] = math.sqrt(occurrences)
        return roots

    def _get_size(self, data):
        """Return the sum of the square roots computed by ``_compress``."""
        roots = self._compress(data)
        return sum(roots.values())
class EntropyNCD(_NCDBase):
sequences = self._get_sequences(*sequences)
concat_length = float('Inf')
empty = type(sequences[0])()
for data in permutations(sequences):
if isinstance(empty, (str, bytes)):
data = empty.join(data)
else:
data = sum(data, empty)
concat_length = min(concat_length, self._get_size(data))
compressed_lengths = [self._get_size(s) for s in sequences]
return float(concat_length - min(compressed_lengths)) / max(compressed_lengths)
class _BinaryNCDBase(_NCDBase):
    """NCD base that passes bytes to the parent implementation."""

    def __init__(self):
        pass

    def __call__(self, *sequences):
        """Encode text inputs to UTF-8 and delegate to the parent ``__call__``.

        Returns 0 when no sequences are given. Only the first sequence is
        inspected to decide whether all of them get encoded.
        """
        if sequences:
            if isinstance(sequences[0], string_types):
                sequences = [item.encode('utf-8') for item in sequences]
            return super(_BinaryNCDBase, self).__call__(*sequences)
        return 0
class ArithNCD(_NCDBase):
"""Arithmetic coding
https://github.com/gw-c/arith