How to use the gensim.models.KeyedVectors class in gensim

To help you get started, we’ve selected a few gensim.models.KeyedVectors examples, based on popular ways it is used in public projects.
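
Every snippet below follows the same basic pattern: load a pre-trained vector file with KeyedVectors.load_word2vec_format, then query the resulting object by word. A minimal sketch of that pattern (the model path is a placeholder; substitute your own file):

from gensim.models import KeyedVectors

# Load vectors stored in the binary word2vec format; the path is illustrative.
vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

print(vectors['king'].shape)                 # raw embedding vector for one word
print(vectors.similarity('king', 'queen'))   # cosine similarity between two words
print(vectors.most_similar('king', topn=5))  # nearest neighbours in the vector space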

github kmpoon / hlta / scripts / compactness_w2v.py View on Github external
def compactness_score(model_path, topic_file_path, with_gensim = True):
	"""
	model_path:	Word2Vec model file
	topic_file_path: Each line in the file is a topic, represented as
			a list of words separated by spaces

	Output:		Prints the compactness score for each topic and a final score across all topics.
	"""

	# Loading can be very slow if the model is large. 
	# User should consider loading the model just once for all the topic files.
	print("Loading Word2Vec model: " + model_path)
	import gensim
	model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
	print("Loading Done.")

	print("Processing topic file: " + topic_file_path)

	line_count = 0
	result = []
	with open(topic_file_path, 'r') as inputfile:
		for line in inputfile:
			line_count += 1
			sims = []

			line = line.strip(' \n').split(' ')
			print(line)
			for i in range(len(line)):
				if line[i] not in model.vocab:
					continue
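
The excerpt is cut off inside the word loop. As a rough guide to where it is heading, a topic's compactness is typically the average pairwise similarity of its words; a minimal standalone sketch of that idea (illustrative only, not the project's actual code):

def topic_compactness(model, words):
    # Keep only the words the model knows, mirroring the model.vocab check above.
    words = [w for w in words if w in model]
    # Average cosine similarity over all unordered word pairs in the topic.
    sims = [model.similarity(w1, w2)
            for i, w1 in enumerate(words)
            for w2 in words[i + 1:]]
    return sum(sims) / len(sims) if sims else 0.0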
github AnubhavGupta3377 / Text-Classification-Models-Pytorch / Model_TextCNN / old_code / utils.py View on Github external
Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        f = open(w2vfile)
        for line in tqdm(f):
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            if word in word_to_index:
                word_embeddings[word] = coefs
        f.close()
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print("Can't load word embeddings.")
        exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
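
A typical next step, not shown in this excerpt, is to stack the dictionary into an embedding matrix whose rows follow the vocabulary indices; word_to_index and embedsize are the same names the function above assumes:

# Build a (vocab_size, embedsize) matrix ordered by the indices in word_to_index.
embedding_matrix = np.zeros((len(word_to_index), embedsize), dtype='float32')
for word, idx in word_to_index.items():
    embedding_matrix[idx] = word_embeddings[word]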
github vietnlp / etnlp / src / codes / api / embedding_evaluator.py View on Github external
def check_oov_of_word_analogies(w2v_format_emb_file, analogy_file, is_vn=True, case_sensitive=True):
    emb_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_format_emb_file,
        binary=False,
        unicode_errors='ignore')

    f_reader = open(analogy_file, "r")
    vocab_arr = []
    for line in f_reader:
        if not case_sensitive:
            line = line.lower()

        if line.startswith(': '):
            continue
        else:
            for word in line.split(" | "):
                # In Vietnamese, we have compound and single word.
                # if is_vn:
                #     if " " in word:
github jiangxinyang227 / NLP-Project / few_shot_learning / siamese_network / data_helper.py View on Github external
def get_word_vectors(self, vocab: List[str]) -> np.ndarray:
        """
        load word vector file,
        :param vocab: vocab
        :return:
        """
        pad_vector = np.zeros(self.__embedding_size)  # the padding token at index 0 gets an all-zero vector
        word_vectors = (1 / np.sqrt(len(vocab) - 1) * (2 * np.random.rand(len(vocab) - 1, self.__embedding_size) - 1))
        if os.path.splitext(self.__word_vector_path)[-1] == ".bin":
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self.__word_vector_path, binary=True)
        else:
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self.__word_vector_path)

        for i in range(1, len(vocab)):
            try:
                vector = word_vec[vocab[i]]
                # rows of word_vectors correspond to vocab[1:], hence the i - 1 offset
                word_vectors[i - 1, :] = vector
            except KeyError:
                print(vocab[i] + " is not in the word vector file")
        word_vectors = np.vstack((pad_vector, word_vectors))
        return word_vectors
github k-fujikawa / Kaggle-Quora-Insincere-Questions-Classification / qiqc / embeddings / pretrained_vector.py View on Github external
def load(cls, tokens, limit=None):
        return KeyedVectors.load_word2vec_format(
            cls.path, binary=True, limit=limit)
github lxw0109 / NLP-Experiments / word2vec / src / 3_word2vec_similarity.py View on Github external
def _load_model(self, model_file="../data/corpus.model.bin", binary=False):
        """
        Load model with C format word2vec file.
        """
        if not os.path.exists(model_file):
            raise Exception("Model file does not exist.")
        model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=binary, unicode_errors="ignore")
        return model
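
Once loaded, the returned KeyedVectors object supports the usual similarity queries. A brief sketch, assuming model is the object returned by _load_model and the query words exist in its vocabulary:

# Cosine similarity between two in-vocabulary words.
print(model.similarity("apple", "banana"))
# The ten words closest to a query word in the embedding space.
print(model.most_similar("apple", topn=10))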
github shivasuri / semantic_textual_similarity / main.py View on Github external
for i in range(1,n+1):
        D[i][0] = D[i-1][0] + 1
    for j in range(1,m+1):
        D[0][j] = D[0][j-1] + 1
    for i in range(1,n+1):
        for j in range(1,m+1):
            D[i][j] = min(D[i-1][j]+1, D[i-1][j-1]+sub_cost(str1[i-1],str2[j-1]), D[i][j-1]+1)
    return D[n][m]

if __name__ == '__main__':
    args = parser.parse_args()
    pp.pprint(args)

    # Load the training data
    vecfile = 'GoogleNews-vectors-negative300.bin'
    vecs = KeyedVectors.load_word2vec_format(vecfile, binary=True)

    with open(args.inputfile, 'r') as inputfile:
        input = inputfile.readlines()

    output_simple = open("pred_simple.txt", "w")
    output_w2v = open("pred_ex1.txt", "w")

    with open("data/en-train.txt", 'r') as train_file:
        train_input = train_file.readlines()

    with open("data/en-val.txt", 'r') as val_file:
        val = val_file.readlines()

    with open("data/en-test.txt", 'r') as test_file:
        test = test_file.readlines()
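
The excerpt ends before the scoring code. For a quick word-vector baseline on sentence pairs (illustrative only, not the project's implementation), KeyedVectors.n_similarity compares two token lists directly:

# Hedged sketch: similarity between the mean vectors of two in-vocabulary token sets.
sent1 = [w for w in "a man is playing a guitar".split() if w in vecs]
sent2 = [w for w in "someone plays an instrument".split() if w in vecs]
print(vecs.n_similarity(sent1, sent2))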
github uhh-lt / sensegram / fast_top_nn / similar_top.py View on Github external
def run(vectors_fpath, output_fpath="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000, threads_num=4, word_freqs=None):
    print("Vectors: {}, only_letters: {}".format(vectors_fpath, only_letters), file=stderr)
    print("Loading vectors from {}".format(vectors_fpath), file=stderr)
    tic = time()
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        vectors_fpath, binary=False, unicode_errors='ignore')
    vectors.init_sims(replace=True)

    print("Vectors loaded in %d sec." % (time()-tic), file=stderr)
    print("Vectors shape is: ", vectors.syn0norm.shape, file=stderr)

    vocab_size = len(vectors.vocab)
    print(("Vocabulary size: %i" % vocab_size))
    
    # Limit the number of words for which to collect neighbours
    if vocab_limit and vocab_limit < vocab_size:
        vocab_size = vocab_limit
    words = vectors.index2word[:vocab_size]
    
    print(("Collect neighbours for %i most frequent words" % vocab_size))
github ShimShim46 / HFT-CNN / data_helper.py View on Github external
try:
        model = FastText.load_fasttext_format(embedding_weights_path)
        pre_trained_embedding = "bin"
    except:
        print ("fastText binary file (.bin) is not found!")
        if os.path.exists("./Word_embedding/wiki.en.vec"):
            print ("Using wikipedia(en) pre-trained word vectors.")
        else:
            print ("Downloading wikipedia(en) pre-trained word vectors.")
            chakin.download(number=2, save_dir="./Word_embedding")
        print ("Loading vectors...")
        if os.path.exists("./Word_embedding_model.pkl"):
            with open("./Word_embedding_model.pkl", mode="rb") as f:
                model = pickle.load(f)
        else:
            model =  KeyedVectors.load_word2vec_format('./Word_embedding/wiki.en.vec')
            with open("Word_embedding_model.pkl", mode="wb") as f:
                pickle.dump(model, f)
        pre_trained_embedding = "txt"

    vocab_size = len(words_map)
    word_dimension = model['a'].shape[0]
    w = np.zeros((vocab_size,word_dimension),dtype=np.float32)

    for k,v in words_map.items():
        word = k
        word_number = v
        
        try:
                w[word_number][:] = model[word]
        except KeyError as e:
                if pre_trained_embedding == "bin":
github elitcloud / elit / elit / dev / pos_tagger.py View on Github external
def main():
    # arguments
    args = parse_args()
    if args.log: logging.basicConfig(filename=args.log, format='%(message)s', level=logging.INFO)
    else: logging.basicConfig(format='%(message)s', level=logging.INFO)

    # x
    trn_graphs = read_graphs(args.tsv, args.trn_data)
    dev_graphs = read_graphs(args.tsv, args.dev_data)

    # lexicon
    w2v = KeyedVectors.load_word2vec_format(args.w2v, binary=True) if args.w2v else None
    f2v = fasttext.load_model(args.f2v) if args.f2v else None
    a2v = KeyedVectors.load_word2vec_format(args.a2v, binary=True) if args.a2v else None

    lexicon = POSLexicon(w2v=w2v, f2v=f2v, a2v=a2v, output_size=args.output_size)

    # model
    model = POSModel(feature_context=args.feature_context, batch_size=64, w2v_dim=100)
    model.train(trn_graphs, dev_graphs, lexicon, num_steps=args.num_steps,
                bagging_ratio=args.bagging_ratio, optimizer=args.optimizer, force_init=True)