How to use the sense2vec.util.split_key function in sense2vec

To help you get started, we’ve selected a few sense2vec examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: explosion / sense2vec / tests / test_util.py (view on GitHub)
def test_make_split_key(word, sense, expected):
    """Check that make_key and split_key are inverses for the given case.

    Building a key from ``word`` and ``sense`` must yield ``expected``, and
    splitting ``expected`` must give back the ``(word, sense)`` pair.
    """
    built_key = make_key(word, sense)
    assert built_key == expected
    recovered = split_key(expected)
    assert recovered == (word, sense)
Source: explosion / sense2vec / scripts / 05_export.py (view on GitHub)
msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
Source: explosion / sense2vec / scripts / 05_export.py (view on GitHub)
def get_redundant_keys(vocab, vectors, min_distance):
    """Find keys whose vector is too close to their word's top-ranked key.

    Keys are grouped by the lowercased last ``_``-separated piece of their
    term (as returned by ``split_key``). Within each group of two or more
    keys, every key is compared against the highest-ranked entry (ordered by
    frequency, ties broken by key) and collected when the similarity reported
    by ``cosine_similarity`` is at least ``1 - min_distance``.

    vocab: Mapping of key -> frequency.
    vectors: Mapping of key -> vector; must contain every compared key.
    min_distance: Minimum distance to keep a key; values <= 0 disable the
        check entirely.
    RETURNS (list): The keys considered redundant.
    """
    if min_distance <= 0.0:
        return []
    grouped = defaultdict(list)
    for key, count in vocab.items():
        try:
            word, _ = split_key(key)
        except ValueError:
            # Keys that can't be split into word and sense are ignored.
            continue
        base = word.split("_")[-1].lower()
        grouped[base].append((count, key))
    redundant = []
    for entries in grouped.values():
        if len(entries) < 2:
            continue
        # Highest frequency first; the leading entry is the reference.
        ranked = sorted(entries, reverse=True)
        reference = vectors[ranked[0][1]]
        for _, other_key in ranked[1:]:
            candidate = vectors[other_key]
            similarity = cosine_similarity(reference, candidate)
            if similarity >= (1 - min_distance):
                redundant.append(other_key)
    return redundant
Source: explosion / sense2vec / scripts / train.py (view on GitHub)
w2v_model.build_vocab(sentences)
    print("Training the model...")
    w2v_model.train(
        sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter
    )
    print("Creating the sense2vec model...")
    vectors = []
    all_senses = set()
    for string in w2v_model.wv.vocab:
        vocab = w2v_model.wv.vocab[string]
        freq, idx = vocab.count, vocab.index
        if freq < min_count:
            continue
        vector = w2v_model.wv.vectors[idx]
        vectors.append((string, freq, vector))
        _, sense = split_key(string)
        all_senses.add(sense)
    s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses)
    for string, freq, vector in vectors:
        s2v.add(string, vector, freq)
    print("Saving the model...")
    s2v.to_disk(out_dir)
    print(f"Saved model to directory: {out_dir}")
Source: explosion / sense2vec / scripts / 05_export.py (view on GitHub)
def get_minority_keys(freqs, min_ratio):
    """Collect keys that are rare compared to the top key of the same word.

    Keys are grouped by their lowercased term (as returned by ``split_key``);
    keys with a falsy frequency are left out of the grouping. Within each
    group of two or more keys, a key is collected when its frequency divided
    by the group's highest frequency is below ``min_ratio``.

    freqs: Mapping of key -> frequency.
    min_ratio: Minimum frequency ratio relative to the top key.
    RETURNS (list): The keys that fall below the ratio.
    """
    grouped = defaultdict(list)
    for key, count in freqs.items():
        try:
            word, _ = split_key(key)
        except ValueError:
            # Keys that can't be split into word and sense are ignored.
            continue
        if not count:
            continue
        grouped[word.lower()].append((count, key))
    minority = []
    for entries in grouped.values():
        if len(entries) < 2:
            continue
        # Highest frequency first; the leading entry is the main sense.
        ranked = sorted(entries, reverse=True)
        top_count = ranked[0][0]
        minority.extend(
            key for count, key in ranked[1:] if count / top_count < min_ratio
        )
    return minority