How to use the sense2vec.Sense2Vec class in sense2vec

To help you get started, we’ve selected a few sense2vec examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_registry():
    """Check that registered custom key functions are used via overrides."""

    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        return tuple(key.split("###"))

    entries = (("clear", "NOUN", 100), ("clear", "VERB", 200), ("clear", "ADJ", 300))
    vec = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    model = Sense2Vec(
        shape=(len(entries), 4),
        overrides={"make_key": "custom_make_key", "split_key": "custom_split_key"},
    )
    # Keys are built with the custom "###" separator, and every sense is
    # appended to the model config as it is added.
    for word, sense, freq in entries:
        model.add(custom_make_key(word, sense), vec, freq)
        model.cfg["senses"].append(sense)

    assert "clear###NOUN" in model
    others = model.get_other_senses("clear###NOUN")
    assert len(others) == 2
    for expected_key in ("clear###VERB", "clear###ADJ"):
        assert expected_key in others
    assert model.get_best_sense("clear") == "clear###ADJ"
github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_sense2vec_most_similar():
    """Check most_similar results for single-key and multi-key queries."""
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    # "a" and "x" were added with the same vector. Compare the score with a
    # tolerance only: an exact `== 1.0` check on a float32 result is brittle
    # and adds nothing over the approx assertion.
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"
    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted(key for key, _ in result2) == ["b", "d"]
    # NOTE(review): result3 is computed but not asserted on in this excerpt;
    # confirm against the full test before relying on it.
    result3 = s2v.most_similar(["a", "b"], n=3)
github explosion / sense2vec / tests / test_model.py View on Github external
def s2v():
    """Load a Sense2Vec object from the "data" directory next to this file."""
    return Sense2Vec().from_disk(Path(__file__).parent / "data")
github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_sense2vec_freqs():
    """Check storing, reading, defaulting and setting of key frequencies."""
    model = Sense2Vec(shape=(10, 4))
    vec = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    known = (("test1", 123), ("test2", 456))
    for key, freq in known:
        model.add(key, vec, freq)
    assert len(model.freqs) == 2
    for key, freq in known:
        assert model.get_freq(key) == freq

    # A key that was never added has no frequency unless a default is given.
    missing = "test3"
    assert model.get_freq(missing) is None
    assert model.get_freq(missing, 100) == 100
    model.set_freq(missing, 200)
    assert model.get_freq(missing) == 200
github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_sense2vec_best_sense():
    """Check get_best_sense with configured senses, casing and explicit senses."""
    model = Sense2Vec(shape=(5, 4))
    model.cfg["senses"] = ["A", "B", "C"]
    entries = (("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2))
    for key, freq in entries:
        vec = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
        model.add(key, vec, freq)

    assert model.get_best_sense("a") == "a|A"
    # By default "B|C" (freq 2) is returned for "b"; with ignore_case=False
    # the result is "b|A".
    assert model.get_best_sense("b") == "B|C"
    assert model.get_best_sense("b", ignore_case=False) == "b|A"
    assert model.get_best_sense("c") is None

    # With no senses configured, only explicitly passed senses are found.
    model.cfg["senses"] = []
    assert model.get_best_sense("a") is None
    assert model.get_best_sense("b", ["A"]) == "b|A"
    assert model.get_best_sense("b", ["A", "C"]) == "B|C"
github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_sense2vec_object():
    """Check the container behaviour of a Sense2Vec object."""
    dims = (10, 4)
    model = Sense2Vec(shape=dims)
    assert model.vectors.shape == dims
    assert len(model) == 10

    vec = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    model.add("test", vec)
    assert "test" in model
    # Looking a string up in the string store yields an int, which also works
    # for membership tests and item access on the model.
    test_id = model.strings["test"]
    assert isinstance(test_id, int)
    assert test_id in model
    assert "foo" not in model
    for lookup in ("test", test_id):
        assert numpy.array_equal(model[lookup], vec)
    assert list(model.keys()) == ["test"]

    model.add("test2", vec)
    assert "test2" in model
    assert sorted(model.keys()) == ["test", "test2"]
    # Item assignment for this key is expected to raise ValueError.
    with pytest.raises(ValueError):
        model["test3"] = vec
github explosion / sense2vec / tests / test_sense2vec.py View on Github external
def test_sense2vec_to_from_bytes():
    """Check that to_bytes / from_bytes round-trips vectors, keys and freqs."""
    original = Sense2Vec(shape=(2, 4))
    entries = (
        ("test1", numpy.asarray([1, 2, 3, 4], dtype=numpy.float32), 123),
        ("test2", numpy.asarray([5, 6, 7, 8], dtype=numpy.float32), 456),
    )
    for key, vec, freq in entries:
        original.add(key, vec, freq)

    payload = original.to_bytes()
    restored = Sense2Vec().from_bytes(payload)
    assert len(restored) == 2
    assert restored.vectors.shape == (2, 4)
    for key, _, _ in entries:
        assert key in restored
    for key, _, freq in entries:
        assert restored.get_freq(key) == freq
    for key, vec, _ in entries:
        assert numpy.array_equal(restored[key], vec)
    assert payload == restored.to_bytes()

    # Serialize again without the string store.
    payload_no_strings = original.to_bytes(exclude=["strings"])
    stringless = Sense2Vec().from_bytes(payload_no_strings)
    assert len(stringless.strings) == 0
    assert "test1" in stringless
    assert original.strings["test1"] in stringless
    with pytest.raises(KeyError):  # can't resolve hash
        stringless.strings[original.strings["test2"]]
github explosion / sense2vec / sense2vec / prodigy_recipes.py View on Github external
senses=None,
    exclude_senses=EVAL_EXCLUDE_SENSES,
    n_freq=100_000,
    n_similar=10,
    batch_size=5,
    eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a vectors model by looking at the most similar entries it returns
    for a random phrase and unselecting the mistakes.
    """
    log("RECIPE: Starting recipe sense2vec.eval-most-similar", locals())
    random.seed(0)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    seen = set()
    DB = connect()
    if dataset in DB:
        examples = DB.get_dataset(dataset)
        seen.update([eg["text"] for eg in examples if eg["answer"] == "accept"])
        log(f"RECIPE: Skipping {len(seen)} terms already in dataset")

    def get_html(key, score=None, large=False):
        """Render a key as an HTML snippet showing its word and sense.

        key: The key to render; split into word and sense via s2v.split_key.
        score: Optional score. Appended (formatted with ".4") only if
            show_scores is truthy and the score is not None.
        large: If True, the word uses a 30px font size instead of 20px.
        Returns the HTML markup as a string.
        """
        word, sense = s2v.split_key(key)
        # The f-strings are single-quoted so the double-quoted HTML attribute
        # values don't terminate the literal (which would be a SyntaxError).
        html_word = f'<span style="font-size: {30 if large else 20}px">{word}</span>'
        html_sense = f'<strong style="opacity: 0.75; font-size: 14px; padding-left: 10px">{sense}</strong>'
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f' <span style="opacity: 0.75; font-size: 12px; padding-left: 10px">{score:.4}</span>'
        return html
github explosion / sense2vec / sense2vec / prodigy_recipes.py View on Github external
eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a sense2vec model by asking about phrase triples: is word A more
    similar to word B, or to word C? If the human mostly agrees with the model,
    the vectors model is good.
    """
    random.seed(0)
    log("RECIPE: Starting recipe sense2vec.eval", locals())
    strategies = eval_strategies.get_all()
    if strategy not in strategies.keys():
        err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
        msg.fail(err, exits=1)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)

    def get_html(key, score=None, large=False):
        """Render a key as an HTML snippet showing its word and sense.

        key: The key to render; split into word and sense via s2v.split_key.
        score: Optional score. Appended (formatted with ".4") only if
            show_scores is truthy and the score is not None.
        large: If True, the word uses a 30px font size instead of 20px.
        Returns the HTML markup as a string.
        """
        word, sense = s2v.split_key(key)
        # The f-strings are single-quoted so the double-quoted HTML attribute
        # values don't terminate the literal (which would be a SyntaxError).
        html_word = f'<span style="font-size: {30 if large else 20}px">{word}</span>'
        html_sense = f'<strong style="opacity: 0.75; font-size: 14px; padding-left: 10px">{sense}</strong>'
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f' <span style="opacity: 0.75; font-size: 12px; padding-left: 10px">{score:.4}</span>'
        return html

    def get_stream():
        strategy_func = eval_strategies.get(strategy)
        log(f"RECIPE: Using strategy {strategy}")
        # Limit to most frequent entries
        keys = [key for key, _ in s2v.frequencies[:n_freq]]
github explosion / sense2vec / scripts / train.py View on Github external
w2v_model.train(
        sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter
    )
    print("Creating the sense2vec model...")
    vectors = []
    all_senses = set()
    # Collect (key, frequency, vector) for every vocabulary entry that meets
    # the minimum count, and record each sense seen along the way.
    for string in w2v_model.wv.vocab:
        vocab = w2v_model.wv.vocab[string]
        freq, idx = vocab.count, vocab.index
        # Was the HTML entity "&lt;" in the excerpt, which is not valid Python.
        if freq < min_count:
            continue
        vector = w2v_model.wv.vectors[idx]
        vectors.append((string, freq, vector))
        _, sense = split_key(string)
        all_senses.add(sense)
    # The table is sized from the number of entries that were kept.
    s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses)
    for string, freq, vector in vectors:
        s2v.add(string, vector, freq)
    print("Saving the model...")
    s2v.to_disk(out_dir)
    print(f"Saved model to directory: {out_dir}")