How to use the gensim.models.Word2Vec.load_word2vec_format function in gensim

To help you get started, we’ve selected a few gensim examples, based on popular ways it is used in public projects.

github tuetschek / ratpred / ratpred / View on Github external
def __init__(self, cfg):
        """Initialise the embedding source from a configuration mapping.

        ``cfg`` must contain the key ``'word2vec_model'``; its value is handed
        to ``load_word2vec_format`` (with ``binary=True``) as the file to load.
        Optional keys and their defaults: ``'emb_freq_threshold'`` (2),
        ``'max_sent_len'`` (50) and ``'reverse'`` (False).

        Raises:
            Exception: if ``'word2vec_model'`` is missing from ``cfg``.
        """
        # TODO casing is ignored so far
        if 'word2vec_model' not in cfg:
            raise Exception('Need loaded word2vec model')
        # Newer gensim releases expose the word2vec-format loader on
        # KeyedVectors and no longer support Word2Vec.load_word2vec_format;
        # fall back to Word2Vec only when KeyedVectors cannot do the job
        # (old gensim installations).
        keyed_vectors = getattr(gensim.models, 'KeyedVectors', None)
        if hasattr(keyed_vectors, 'load_word2vec_format'):
            loader = keyed_vectors
        else:
            loader = gensim.models.Word2Vec
        self._w2v = loader.load_word2vec_format(cfg['word2vec_model'], binary=True)
        self.freq_threshold = cfg.get('emb_freq_threshold', 2)
        self.max_sent_len = cfg.get('max_sent_len', 50)
        self.reverse = cfg.get('reverse', False)
github pinkeshbadjatiya / twitter-hatespeech / View on Github external
def load_initial_emb(path="/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt"):
    """Load the initial embeddings from a word2vec text-format file.

    Args:
        path: file to load. Defaults to the location that used to be
            hard-coded here, so existing zero-argument callers behave as
            before.

    Returns:
        The object returned by gensim's ``load_word2vec_format``.
    """
    # Newer gensim releases expose the word2vec-format loader on KeyedVectors
    # and no longer support Word2Vec.load_word2vec_format; fall back to
    # Word2Vec only on old installations that lack the KeyedVectors loader.
    keyed_vectors = getattr(gensim.models, 'KeyedVectors', None)
    if hasattr(keyed_vectors, 'load_word2vec_format'):
        loader = keyed_vectors
    else:
        loader = gensim.models.Word2Vec
    return loader.load_word2vec_format(path)
github sonvx / word2vecVN / word2vec-simple-visualization / View on Github external
if __name__ == "__main__":
    import os

    # Directory containing this script; the model is looked up relative to it.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # download from
    # NOTE(review): this path ends at the data directory, for which
    # os.path.isfile() is false; the model file name appears to be missing
    # here -- confirm against the original project.
    model = dir_path + '/data/'

    if os.path.isfile(model):
        # Parenthesised single-argument print works on both Python 2 and 3.
        print('Loading word2vec model ...')
        # gensim moved the word2vec-format loader to KeyedVectors; pick the
        # class that matches the installed version.
        if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"):
            from gensim.models import KeyedVectors
            word2vec_model = KeyedVectors.load_word2vec_format(model, binary=True)
        else:
            from gensim.models import Word2Vec
            word2vec_model = Word2Vec.load_word2vec_format(model, binary=True)
    else:
        print("Download word2vec model and put into ./data/. File:")
github tbmihailov / conll16st-hd-sdp / sdp / utils / View on Github external
Receives all sentences in MPQA and EPOS.
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.

    print ("Building vocabulary...")
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}

    if(vector == 'w2v'):
        print ("Loading w2v model...")
        model = models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)

        print ("Building embeddings...")
        vocab_size = len(vocabulary)
        embeddings = np.zeros((vocab_size, 300))
        for word in vocabulary:
        	index = vocabulary[word]
        		embeddings[index, :] = model[word].reshape((1,300))
        	except KeyError:
        		embeddings[index, :] = np.random.uniform(-0.23, 0.23, [1,300])

        print ("Write data in a pickle...")
        pickle_file = 'w2v.pickle'
            fp = open(pickle_file, 'wb')
            save = {
github viveksck / langchangetrack / langchangetrack / tsconstruction / distributional / scripts / View on Github external
def load_model_skipgram(model_path):
    """Load the skipgram model from a file in word2vec format.

    Args:
        model_path: file to load; passed to ``load_word2vec_format`` with its
            default (non-binary) settings.

    Returns:
        The object returned by gensim's ``load_word2vec_format``.
    """
    # Newer gensim releases expose the word2vec-format loader on KeyedVectors
    # and no longer support Word2Vec.load_word2vec_format; fall back to
    # Word2Vec only on old installations that lack the KeyedVectors loader.
    keyed_vectors = getattr(gensim.models, 'KeyedVectors', None)
    if hasattr(keyed_vectors, 'load_word2vec_format'):
        loader = keyed_vectors
    else:
        loader = gensim.models.Word2Vec
    return loader.load_word2vec_format(model_path)
github openeventdata / mordecai / View on Github external
"Somalia":"SOM", "South_Africa":"ZAF",
    "South_Korea":"KOR", "South Sudan":"SSD", "Spain":"ESP", "Sri_Lanka":"LKA", "Sudan":"SDN",
    "Suriname":"SUR", "Svalbard Jan Mayen":"SJM",
    "Swaziland":"SWZ", "Sweden":"SWE", "Switzerland":"CHE", "Syria":"SYR",
    "Taiwan":"TWN", "Tajikistan":"TJK", "Tanzania":"TZA", "Thailand":"THA",
    "Timor Leste":"TLS", "East_Timor":"TLS","Togo":"TGO", "Tokelau":"TKL", "Tonga":"TON", "Trinidad Tobago":"TTO",
    "Tunisia":"TUN", "Turkey":"TUR",
    "Turkmenistan":"TKM", "Turks Caicos Islands":"TCA", "Tuvalu":"TUV", "U.S. Minor Outlying Islands":"UMI",
    "Virgin_Islands":"VIR", "Uganda":"UGA",
    "Ukraine":"UKR", "United_Arab_Emirates":"ARE", "United_Kingdom":"GBR",
    "UK":"GBR", "United_States":"USA", "USA":"USA", "America":"USA",
    "Uruguay":"URY", "Uzbekistan":"UZB", "Vanuatu":"VUT", "Vatican":"VAT", "Venezuela":"VEN",
    "Vietnam":"VNM", "Wallis Futuna":"WLF",
    "Western_Sahara":"ESH", "Yemen":"YEM", "Zambia":"ZMB", "Zimbabwe":"ZWE"}

prebuilt = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
vocab_set = set(prebuilt.vocab.keys())

countries = stopword_country_names.keys()

idx_country_mapping = {}
index = numpy.empty(shape=(len(countries), 300), dtype=dtype)

for idx, country in enumerate(countries):
    country = unidecode(country)
        vector = prebuilt[country]
    except KeyError:
    index[idx] = vector
    idx_country_mapping[idx] = stopword_country_names[country]
github kylemcdonald / EmbeddingScripts / View on Github external
# Look up every word of `<input>/wordlist` in a word2vec-format model and write
# the words that have a vector to `<input>/words` and their vectors, in the
# same order, to `<input>/vectors` (tab-separated).
import argparse
import sys

import numpy
from gensim.models import Word2Vec

try:
    # Newer gensim releases expose the word2vec-format loader on KeyedVectors
    # and no longer support Word2Vec.load_word2vec_format.
    from gensim.models import KeyedVectors
except ImportError:
    KeyedVectors = None

parser = argparse.ArgumentParser(
    description='Generate a .tsv of word2vec vectors for a word list.')
parser.add_argument('-i', '--input', default='data')
parser.add_argument('-m', '--model', default='models/GoogleNews-vectors-negative300.bin')
args = parser.parse_args()

print('Loading wordlist from {}/wordlist'.format(args.input))
# atleast_1d: genfromtxt returns a 0-d array for a single-entry word list,
# which can neither be iterated nor passed to len().
wordlist = numpy.atleast_1d(
    numpy.genfromtxt('{}/wordlist'.format(args.input), dtype='str'))
words = []
vectors = []
print('Loading model from ' + args.model)
# Fall back to Word2Vec only on old gensim without the KeyedVectors loader.
loader = KeyedVectors if hasattr(KeyedVectors, 'load_word2vec_format') else Word2Vec
model = loader.load_word2vec_format(args.model, binary=True)
print('Looking up {} words.'.format(len(wordlist)))
for word in wordlist:
    if word in model:
        print('added: {}'.format(word))
        # Keep the word and its vector at the same position in both lists so
        # the two output files stay aligned line by line.
        words.append(word)
        vectors.append(model[word])
    else:
        print('no vector: {}'.format(word))
# float() keeps the ratio correct under Python 2 integer division; the guard
# avoids ZeroDivisionError for an empty word list.
total = len(wordlist)
found_ratio = len(words) / float(total) if total else 0.0
print('Saving {:.2%} of the words.'.format(found_ratio))
numpy.savetxt('{}/words'.format(args.input), words, fmt='%s')
print('Saving word vectors.')
numpy.savetxt('{}/vectors'.format(args.input), vectors, fmt='%.8f', delimiter='\t')
github gaetangate / word2vec-cluster / View on Github external
def main():
    """Cluster the vectors of a word2vec-format model with KMeans.

    Reads four positional command-line arguments: ``model`` (model path),
    ``format`` (int; 1 = binary, 0 = text), ``k`` (number of clusters) and
    ``output`` (output file). Progress and timings are printed to stdout.

    NOTE(review): the portion of the function visible here ends after the
    (word, cluster) pairs are sorted; nothing is written to ``args.output``
    in the lines shown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="word2vec model path")
    parser.add_argument("format", help="1 = binary format, 0 = text format", type=int)
    parser.add_argument("k", help="number of clusters", type=int)
    parser.add_argument("output", help="output file")
    args = parser.parse_args()

    # Step 1: load the model; the integer `format` flag is turned into the
    # boolean `binary` switch of the loader.
    start = time.time()
    print("Load word2vec model ... ", end="", flush=True)
    w2v_model = Word2Vec.load_word2vec_format(args.model, binary=bool(args.format))
    print("finished in {:.2f} sec.".format(time.time() - start), flush=True)
    # `syn0` is indexed as a 2-D array below: axis 0 is reported as the word
    # count, axis 1 as the vector size.
    word_vectors = w2v_model.wv.syn0
    n_words = word_vectors.shape[0]
    vec_size = word_vectors.shape[1]
    print("#words = {0}, vector size = {1}".format(n_words, vec_size))

    # Step 2: cluster all vectors into `k` groups with a fixed random_state.
    # NOTE(review): n_jobs=-1 presumably requests all CPU cores -- confirm
    # that the installed KMeans implementation still accepts this argument.
    start = time.time()
    print("Compute clustering ... ", end="", flush=True)
    kmeans = KMeans(n_clusters=args.k, n_jobs=-1, random_state=0)
    idx = kmeans.fit_predict(word_vectors)
    print("finished in {:.2f} sec.".format(time.time() - start), flush=True)

    # Step 3: pair each entry of `index2word` with the label at the same
    # position in `idx`, then order the pairs by label (ascending).
    start = time.time()
    print("Generate output file ... ", end="", flush=True)
    word_centroid_list = list(zip(w2v_model.wv.index2word, idx))
    word_centroid_list_sort = sorted(word_centroid_list, key=lambda el: el[1], reverse=False)
github julianser / hed-dlg-truncated / View on Github external
raise Exception("Embedding dictionary file not found!")

# Load model dictionary
model_dict = cPickle.load(open(args.model_dictionary, 'r'))

str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in model_dict])
i_dim = len(str_to_idx.keys())"Vocabulary size: %d" % i_dim)

word_freq = dict([(tok_id, freq) for _, tok_id, freq, _ in model_dict])

# Load pretrained word embeddings
if uses_word2vec:
    import gensim, logging
    embedding_dict = gensim.models.Word2Vec.load_word2vec_format(args.embedding_dictionary, binary=True)
    embedding_dict = cPickle.load(open(args.embedding_dictionary, "rb" ) )

if uses_word2vec:
    raw_emb_dim = embedding_dict['hello'].shape[0]
    raw_emb_dim = embedding_dict[embedding_dict.keys()[0]].shape[0]"Raw word embedding dim: %d" % raw_emb_dim)

W_emb_raw = numpy.zeros((i_dim, raw_emb_dim))

words_found = 0
unique_word_indices_found = []

unique_words_left_out = []
github mhw32 / adaware-nlp / lemmatizer / View on Github external
''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and output is
        an embedded vector for the lemmatized form.

        sentences : list
                    list of sentences where each sentence is list of tokens
        max_words : integer
                    maximum number of words allowed in sentence
        train_test_split : boolean
                           whether to split data into 2 sets

    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    lemmatizer = WordNetLemmatizer().lemmatize

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {}
    param_dict['max_words'] = max_words

    for sent_i, words in enumerate(sentences):
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(