Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_registry():
    """Check that registered custom key functions are picked up via overrides.

    Two functions are registered under custom names, a Sense2Vec table is
    built with ``overrides`` pointing at those names, and the lookups below
    are expected to operate on keys in the custom ``word###sense`` format.
    """

    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        # Custom key layout: word and sense joined by "###".
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        # Inverse of custom_make_key.
        parts = key.split("###")
        return tuple(parts)

    overrides = dict(make_key="custom_make_key", split_key="custom_split_key")
    shared_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    entries = [("clear", "NOUN", 100), ("clear", "VERB", 200), ("clear", "ADJ", 300)]
    model = Sense2Vec(shape=(len(entries), 4), overrides=overrides)
    for word, sense, freq in entries:
        model.add(custom_make_key(word, sense), shared_vector, freq)
        model.cfg["senses"].append(sense)

    assert "clear###NOUN" in model
    alternatives = model.get_other_senses("clear###NOUN")
    assert len(alternatives) == 2
    for expected_key in ("clear###VERB", "clear###ADJ"):
        assert expected_key in alternatives
    # "ADJ" was added with the highest frequency (300) of the three entries.
    assert model.get_best_sense("clear") == "clear###ADJ"
def test_sense2vec_most_similar():
    """Check the entries and scores returned by ``Sense2Vec.most_similar``.

    Six vectors are added; "x" has the same vector as "a", so "a" is expected
    to be the top match for "x" with a score of approximately 1.0.
    """
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))

    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    # Compare the float32 score with a tolerance only: an exact `== 1.0`
    # check is brittle against rounding and adds nothing over approx().
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"

    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted([key for key, _ in result2]) == ["b", "d"]

    # Multi-key query with n=3; only checked for running without error here.
    s2v.most_similar(["a", "b"], n=3)
def s2v():
    """Load a Sense2Vec object from the ``data`` directory next to this file."""
    here = Path(__file__).parent
    model = Sense2Vec()
    return model.from_disk(here / "data")
def test_sense2vec_freqs():
    """Check storing, reading, defaulting and setting of key frequencies."""
    model = Sense2Vec(shape=(10, 4))
    shared_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    expected_freqs = {"test1": 123, "test2": 456}
    for key, count in expected_freqs.items():
        model.add(key, shared_vector, count)

    assert len(model.freqs) == 2
    for key, count in expected_freqs.items():
        assert model.get_freq(key) == count

    # "test3" was never added: expect None, or the supplied default.
    assert model.get_freq("test3") is None
    assert model.get_freq("test3", 100) == 100

    # After an explicit set_freq the value is readable.
    model.set_freq("test3", 200)
    assert model.get_freq("test3") == 200
def test_sense2vec_best_sense():
    """Check ``get_best_sense`` for different words, casings and sense lists."""
    model = Sense2Vec(shape=(5, 4))
    model.cfg["senses"] = ["A", "B", "C"]
    freq_by_key = {"a|A": 100, "a|B": 50, "a|C": 10, "b|A": 1, "B|C": 2}
    for key, freq in freq_by_key.items():
        model.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq)

    assert model.get_best_sense("a") == "a|A"
    # By default "b" resolves to "B|C"; with ignore_case=False to "b|A".
    assert model.get_best_sense("b") == "B|C"
    assert model.get_best_sense("b", ignore_case=False) == "b|A"
    assert model.get_best_sense("c") is None

    # With no senses configured, nothing is found unless senses are passed in.
    model.cfg["senses"] = []
    assert model.get_best_sense("a") is None
    assert model.get_best_sense("b", ["A"]) == "b|A"
    assert model.get_best_sense("b", ["A", "C"]) == "B|C"
def test_sense2vec_object():
    """Check the container behaviour of a Sense2Vec object.

    Covers shape and length, membership by string and by hash, item lookup,
    ``keys()``, and the ValueError expected when assigning an unknown key.
    """
    model = Sense2Vec(shape=(10, 4))
    assert model.vectors.shape == (10, 4)
    assert len(model) == 10

    vec = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    model.add("test", vec)
    assert "test" in model
    key_hash = model.strings["test"]
    assert isinstance(key_hash, int)
    assert key_hash in model
    assert "foo" not in model

    # The vector can be fetched by string key as well as by its hash.
    assert numpy.array_equal(model["test"], vec)
    assert numpy.array_equal(model[model.strings["test"]], vec)
    assert list(model.keys()) == ["test"]

    model.add("test2", vec)
    assert "test2" in model
    assert sorted(model.keys()) == ["test", "test2"]

    with pytest.raises(ValueError):
        model["test3"] = vec
def test_sense2vec_to_from_bytes():
    """Check that a Sense2Vec object round-trips through ``to_bytes``/``from_bytes``.

    Also serializes with ``exclude=["strings"]`` and checks the restored
    object has an empty string store while keys are still found.
    """
    source = Sense2Vec(shape=(2, 4))
    entries = {
        "test1": (numpy.asarray([1, 2, 3, 4], dtype=numpy.float32), 123),
        "test2": (numpy.asarray([5, 6, 7, 8], dtype=numpy.float32), 456),
    }
    for key, (vec, freq) in entries.items():
        source.add(key, vec, freq)

    payload = source.to_bytes()
    restored = Sense2Vec().from_bytes(payload)
    assert len(restored) == 2
    assert restored.vectors.shape == (2, 4)
    for key in entries:
        assert key in restored
    for key, (_, freq) in entries.items():
        assert restored.get_freq(key) == freq
    for key, (vec, _) in entries.items():
        assert numpy.array_equal(restored[key], vec)
    # Serializing the restored object reproduces the same bytes.
    assert payload == restored.to_bytes()

    payload_without_strings = source.to_bytes(exclude=["strings"])
    stringless = Sense2Vec().from_bytes(payload_without_strings)
    assert len(stringless.strings) == 0
    assert "test1" in stringless
    assert source.strings["test1"] in stringless
    with pytest.raises(KeyError):  # can't resolve hash
        stringless.strings[source.strings["test2"]]
senses=None,
exclude_senses=EVAL_EXCLUDE_SENSES,
n_freq=100_000,
n_similar=10,
batch_size=5,
eval_whole=False,
eval_only=False,
show_scores=False,
):
"""
Evaluate a vectors model by looking at the most similar entries it returns
for a random phrase and unselecting the mistakes.
"""
log("RECIPE: Starting recipe sense2vec.eval-most-similar", locals())
random.seed(0)
s2v = Sense2Vec().from_disk(vectors_path)
log("RECIPE: Loaded sense2vec vectors", vectors_path)
seen = set()
DB = connect()
if dataset in DB:
examples = DB.get_dataset(dataset)
seen.update([eg["text"] for eg in examples if eg["answer"] == "accept"])
log(f"RECIPE: Skipping {len(seen)} terms already in dataset")
def get_html(key, score=None, large=False):
    """Render a sense2vec key as an HTML snippet.

    key: Key to render; split into word and sense via ``s2v.split_key``.
    score: Optional score, appended only when the enclosing ``show_scores``
        flag is truthy and the score is not None.
    large: Use a 30px font for the word instead of 20px.
    RETURNS (str): The HTML markup.
    """
    word, sense = s2v.split_key(key)
    # Single-quoted f-strings so the double quotes of the HTML attributes
    # can appear literally; nesting them in a double-quoted literal is a
    # syntax error.
    html_word = f'<span style="font-size: {30 if large else 20}px">{word}</span>'
    html_sense = f'<strong style="opacity: 0.75; font-size: 14px; padding-left: 10px">{sense}</strong>'
    html = f"{html_word} {html_sense}"
    if show_scores and score is not None:
        html += f' <span style="opacity: 0.75; font-size: 12px; padding-left: 10px">{score:.4}</span>'
    return html
eval_whole=False,
eval_only=False,
show_scores=False,
):
"""
Evaluate a sense2vec model by asking about phrase triples: is word A more
similar to word B, or to word C? If the human mostly agrees with the model,
the vectors model is good.
"""
random.seed(0)
log("RECIPE: Starting recipe sense2vec.eval", locals())
strategies = eval_strategies.get_all()
if strategy not in strategies.keys():
err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
msg.fail(err, exits=1)
s2v = Sense2Vec().from_disk(vectors_path)
log("RECIPE: Loaded sense2vec vectors", vectors_path)
def get_html(key, score=None, large=False):
    """Render a sense2vec key as an HTML snippet.

    key: Key to render; split into word and sense via ``s2v.split_key``.
    score: Optional score, appended only when the enclosing ``show_scores``
        flag is truthy and the score is not None.
    large: Use a 30px font for the word instead of 20px.
    RETURNS (str): The HTML markup.
    """
    word, sense = s2v.split_key(key)
    # Single-quoted f-strings so the double quotes of the HTML attributes
    # can appear literally; nesting them in a double-quoted literal is a
    # syntax error.
    html_word = f'<span style="font-size: {30 if large else 20}px">{word}</span>'
    html_sense = f'<strong style="opacity: 0.75; font-size: 14px; padding-left: 10px">{sense}</strong>'
    html = f"{html_word} {html_sense}"
    if show_scores and score is not None:
        html += f' <span style="opacity: 0.75; font-size: 12px; padding-left: 10px">{score:.4}</span>'
    return html
def get_stream():
strategy_func = eval_strategies.get(strategy)
log(f"RECIPE: Using strategy {strategy}")
# Limit to most frequent entries
keys = [key for key, _ in s2v.frequencies[:n_freq]]
w2v_model.train(
sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter
)
print("Creating the sense2vec model...")
vectors = []
all_senses = set()
for string in w2v_model.wv.vocab:
vocab = w2v_model.wv.vocab[string]
freq, idx = vocab.count, vocab.index
if freq < min_count:
continue
vector = w2v_model.wv.vectors[idx]
vectors.append((string, freq, vector))
_, sense = split_key(string)
all_senses.add(sense)
s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses)
for string, freq, vector in vectors:
s2v.add(string, vector, freq)
print("Saving the model...")
s2v.to_disk(out_dir)
print(f"Saved model to directory: {out_dir}")