# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_make_split_key(word, sense, expected):
    """Round-trip check: make_key builds the expected key and split_key
    recovers the original (word, sense) pair from it.

    NOTE(review): presumably driven by a pytest.mark.parametrize decorator
    not visible in this chunk — confirm against the full file.
    """
    built = make_key(word, sense)
    assert built == expected
    recovered = split_key(expected)
    assert recovered == (word, sense)
# NOTE(review): interior of an export routine whose `def` header lies outside
# this view; `vocab_file`, `out_dir`, `min_freq_ratio`, `min_distance`,
# `input_path`, `vocab_path` and `output_path` are presumably parameters or
# locals bound earlier. Indentation appears stripped in this paste — restore
# it from the full file before running.
msg.fail("Can't find vocab file", vocab_file, exits=1)
# Create the output directory on demand (including parents).
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory {out_dir}")
# Read the raw vectors file: first line(s) give (n_vectors, vector_size)
# via _get_shape, the rest are one "key v1 v2 ... vN" row per line.
with input_path.open("r", encoding="utf8") as f:
(n_vectors, vector_size), f = _get_shape(f)
vectors_data = f.readlines()
# Load the frequency vocab (key -> count), format defined by read_vocab.
with vocab_path.open("r", encoding="utf8") as f:
vocab = read_vocab(f)
vectors = {}
all_senses = set()
for item in vectors_data:
# rsplit with maxsplit=vector_size keeps keys that themselves contain
# spaces intact: only the trailing vector_size fields become components.
item = item.rstrip().rsplit(" ", vector_size)
key = item[0]
try:
_, sense = split_key(key)
except ValueError:
# Keys that don't parse as word|sense are skipped, not fatal.
continue
vec = item[1:]
# Any row with the wrong number of components aborts the export.
if len(vec) != vector_size:
msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
all_senses.add(sense)
vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
# Drop minority senses and near-duplicate vectors before building the table.
discarded = set()
discarded.update(get_minority_keys(vocab, min_freq_ratio))
discarded.update(get_redundant_keys(vocab, vectors, min_distance))
n_vectors = len(vectors) - len(discarded)
s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
for key, vector in vectors.items():
if key not in discarded:
s2v.add(key, vector)
s2v.set_freq(key, vocab[key])
def get_redundant_keys(vocab, vectors, min_distance):
    """Find keys whose vector sits too close to the dominant key for the
    same word.

    Keys are grouped by the lowercased final underscore-separated segment
    of their term. Within each group the highest-frequency key is the
    reference; any other key whose cosine similarity to it is at least
    ``1 - min_distance`` is reported as redundant.

    Returns a list of keys to discard; empty when ``min_distance`` is not
    positive.
    """
    if min_distance <= 0.0:
        return []
    grouped = defaultdict(list)
    for key, freq in vocab.items():
        try:
            term, _sense = split_key(key)
        except ValueError:
            # Unparseable keys are simply skipped.
            continue
        word = term.split("_")[-1].lower()
        grouped[word].append((freq, key))
    redundant = []
    threshold = 1 - min_distance
    for entries in grouped.values():
        if len(entries) < 2:
            continue
        # Highest frequency first; only the top entry is the reference.
        entries.sort(reverse=True)
        _top_freq, top_key = entries[0]
        top_vec = vectors[top_key]
        for _freq, other_key in entries[1:]:
            if cosine_similarity(top_vec, vectors[other_key]) >= threshold:
                redundant.append(other_key)
    return redundant
# NOTE(review): interior of a training routine whose header lies outside this
# view; `w2v_model`, `sentences`, `min_count`, `size` and `out_dir` are
# presumably its parameters/locals. Indentation appears stripped in this
# paste — restore it from the full file before running.
w2v_model.build_vocab(sentences)
print("Training the model...")
# NOTE(review): `w2v_model.iter` and `w2v_model.wv.vocab` below are the
# gensim 3.x API (removed in gensim 4; replaced by `epochs` and
# `wv.key_to_index`/`get_vecattr`) — confirm the pinned gensim version.
w2v_model.train(
sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter
)
print("Creating the sense2vec model...")
vectors = []
all_senses = set()
# Collect (string, freq, vector) triples for every vocab entry that meets
# the frequency cutoff, and record the set of senses seen.
for string in w2v_model.wv.vocab:
vocab = w2v_model.wv.vocab[string]
freq, idx = vocab.count, vocab.index
if freq < min_count:
continue
vector = w2v_model.wv.vectors[idx]
vectors.append((string, freq, vector))
_, sense = split_key(string)
all_senses.add(sense)
# Build the sense2vec table sized to the surviving entries and persist it.
s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses)
for string, freq, vector in vectors:
s2v.add(string, vector, freq)
print("Saving the model...")
s2v.to_disk(out_dir)
print(f"Saved model to directory: {out_dir}")
def get_minority_keys(freqs, min_ratio):
    """Find keys that are too infrequent relative to the main sense.

    Keys are grouped by their lowercased term; within each group the
    highest-frequency key is the main sense, and any other key whose
    frequency ratio against it falls below ``min_ratio`` is reported.

    Returns a list of keys to discard.
    """
    groups = defaultdict(list)
    for key, freq in freqs.items():
        try:
            term, _sense = split_key(key)
        except ValueError:
            # Unparseable keys are simply skipped.
            continue
        if freq:
            # Zero-frequency keys never participate in the comparison.
            groups[term.lower()].append((freq, key))
    dropped = []
    for entries in groups.values():
        if len(entries) < 2:
            continue
        # Highest frequency first; the top entry is the main sense.
        entries.sort(reverse=True)
        top_freq, _top_key = entries[0]
        dropped.extend(
            key for freq, key in entries[1:] if freq / top_freq < min_ratio
        )
    return dropped