Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_large_freqs():
    """Check PreshCounter probabilities on a frequency file, before and after smoothing.

    The file path is read from the ``TEST_FILE_LOC`` environment variable.
    When the variable is not set the test does nothing and returns ``None``.
    Each non-blank line is expected to start with an integer frequency; the
    count for line ``i`` (0-based) is stored under key ``i + 1``.
    """
    loc = os.environ.get('TEST_FILE_LOC')
    if loc is None:
        return None
    counts = PreshCounter()
    # Use a context manager so the file handle is closed even if a line
    # fails to parse (the handle was previously left open).
    with open(loc) as file_:
        for i, line in enumerate(file_):
            line = line.strip()
            if not line:
                continue
            freq = int(line.split()[0])
            counts.inc(i+1, freq)
    # `i` is the index of the last line read, so i + 2 is one past the
    # largest key that can have been inserted, i.e. an unseen key.
    # NOTE(review): an empty file leaves `i` unbound and raises NameError here.
    oov = i+2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    # After smoothing, the unseen key gets a non-zero probability that is
    # still lower than that of key `i`.
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
def test_count():
    """Counts start at zero and accumulate across repeated ``inc`` calls."""
    tally = PreshCounter()
    # A key that was never incremented reads as zero.
    assert tally[12] == 0
    tally.inc(12, 1)
    assert tally[12] == 1
    # Interleave other keys with a second increment of key 12.
    for key, amount in ((14, 10), (9, 10), (12, 4)):
        tally.inc(key, amount)
    for key, expected in ((12, 5), (14, 10), (9, 10)):
        assert tally[key] == expected
def test_resize():
    """A value stored early is still readable after many more insertions."""
    table = PreshMap(4)
    table[4] = 12
    # Insert 90 further keys into a map that was created with PreshMap(4).
    for key in range(10, 100):
        table[key] = int(key * (random.random() + 1))
    assert table[4] == 12
def test_insert():
    """Missing keys read as ``None``; inserted keys keep their own values."""
    table = PreshMap()
    assert table[1] is None
    table[1] = 5
    assert table[1] == 5
    # Adding a second key must not disturb the first.
    table[2] = 6
    for key, expected in ((1, 5), (2, 6)):
        assert table[key] == expected
def test_iter():
    """``items()`` yields key/value pairs whose sums match what was inserted."""
    table = PreshMap()
    inserted_keys = range(56, 24, -3)
    for key in inserted_keys:
        table[key] = key * 2
    # Start from the totals of what was inserted, then subtract everything
    # that iteration yields; both balances must end at zero.
    key_balance = sum(inserted_keys)
    val_balance = sum(key * 2 for key in inserted_keys)
    for seen_key, seen_value in table.items():
        key_balance -= seen_key
        val_balance -= seen_value
    assert key_balance == 0
    assert val_balance == 0
def test_zero_key():
    """Key 0 behaves like any other key, before and after many insertions."""
    table = PreshMap()
    table[0] = 6
    table[5] = 12

    def check_fixed_entries():
        # The two hand-inserted entries must always read back unchanged.
        assert table[0] == 6
        assert table[5] == 12

    check_fixed_entries()
    for key in range(500, 1000):
        table[key] = key * random.random()
    check_fixed_entries()
def tree():
    """Return a freshly constructed ``SequenceIndex``."""
    index = SequenceIndex()
    return index
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a tab-separated frequencies file and accumulate smoothed counts.

    Each line is split into ``freq``, ``doc_freq`` and ``key`` on the first
    two tab characters. If ``loc`` does not exist a warning is printed and
    ``({}, 0.0)`` is returned.

    NOTE(review): the visible body stops right after the file is reopened,
    and ``max_length``, ``min_doc_freq`` and ``min_freq`` are not used in
    what is shown -- this definition looks truncated; confirm against the
    full source before relying on it.
    """
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    # Paths ending in 'gz' are opened with gzip, everything else via loc.open().
    # NOTE(review): gzip.open() defaults to binary mode, so on Python 3 the
    # str-based split below would presumably fail for gzipped input -- confirm.
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
    # First pass: add each line's frequency under key (line index + 1) and
    # keep a running total of all frequencies.
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    # math.log raises ValueError if total is 0 (e.g. an empty file).
    log_total = math.log(total)
    # Reopen the file from the start; the handle from the first pass is
    # rebound here without being closed.
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc))
    else:
        file_ = loc.open()
def merge_counts(locs, out_loc):
    """Sum word frequencies from several count files into one output file.

    Every input line is ``<freq>\\t<word>``. Each word is mapped through a
    ``StringStore`` to the key used for counting, and the merged totals are
    written to ``out_loc`` in the same ``<count>\\t<word>`` layout.
    """
    store = StringStore()
    totals = PreshCounter()
    for path in locs:
        with io.open(path, 'r', encoding='utf8') as src:
            for row in src:
                fields = row.strip().split('\t', 1)
                # Exactly two fields are required, as with tuple unpacking.
                raw_freq, token = fields
                totals.inc(store[token], int(raw_freq))
    with io.open(out_loc, 'w', encoding='utf8') as dst:
        for key, merged in totals:
            # Map the counting key back to its original string.
            text = store[key]
            dst.write('%d\t%s\n' % (merged, text))
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
for i, line in enumerate(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
probs = {}
with freqs_loc.open() as f:
for line in tqdm(f):
freq, doc_freq, key = line.rstrip().split("\t", 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: