def __init__(self, cfg):
    # TODO casing is ignored so far
    if 'word2vec_model' not in cfg:
        raise Exception('Need loaded word2vec model')
    self._w2v = gensim.models.Word2Vec.load_word2vec_format(cfg['word2vec_model'], binary=True)
    self.freq_threshold = cfg.get('emb_freq_threshold', 2)
    self.max_sent_len = cfg.get('max_sent_len', 50)
    self.reverse = cfg.get('reverse', False)
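# Usage sketch for the config-driven constructor above. The enclosing class
# name ("SentenceEmbedder") and the model path are illustrative assumptions,
# not part of the original snippet; only 'word2vec_model' is required, the
# remaining keys fall back to the defaults shown in __init__.
cfg = {
    'word2vec_model': 'GoogleNews-vectors-negative300.bin',  # hypothetical path
    'emb_freq_threshold': 2,
    'max_sent_len': 50,
    'reverse': False,
}
embedder = SentenceEmbedder(cfg)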
def load_initial_emb():
    initial_emb = gensim.models.Word2Vec.load_word2vec_format(
        "/home/pinkesh/DATASETS/glove-twitter/GENSIM.glove.twitter.27B.200d.txt")
    return initial_emb
if __name__ == "__main__":
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
# download from https://drive.google.com/open?id=0B1GKSX6YCHXlakkzQ2plZVdUUE0
model = dir_path + '/data/wiki.vi.model.bin'
if os.path.isfile(model):
print 'Loading word2vec model ...'
if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"):
from gensim.models import KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format(model, binary=True)
else:
from gensim.models import Word2Vec
word2vec_model = Word2Vec.load_word2vec_format(model, binary=True)
app.run(port=8089)
else:
print "Download word2vec model and put into ./data/. File: https://drive.google.com/open?id=0B1GKSX6YCHXlakkzQ2plZVdUUE0"
Receives all sentences in MPQA and EPOS.
Builds a vocabulary mapping from word to index based on the sentences.
Returns vocabulary mapping and inverse vocabulary mapping.
"""
print ("Building vocabulary...")
# Build vocabulary
word_counts = Counter(itertools.chain(*sentences))
# Mapping from index to word
vocabulary_inv = [x[0] for x in word_counts.most_common()]
# Mapping from word to index
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
if vector == 'w2v':
    print ("Loading w2v model...")
    model = models.Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    print ("Building embeddings...")
    vocab_size = len(vocabulary)
    embeddings = np.zeros((vocab_size, 300))
    for word in vocabulary:
        index = vocabulary[word]
        try:
            embeddings[index, :] = model[word].reshape((1, 300))
        except KeyError:
            embeddings[index, :] = np.random.uniform(-0.23, 0.23, [1, 300])
    print ("Write data in a pickle...")
    pickle_file = 'w2v.pickle'
    try:
        fp = open(pickle_file, 'wb')
        save = {
def load_model_skipgram(model_path):
    """Load the skipgram model from a file in word2vec format."""
    return gensim.models.Word2Vec.load_word2vec_format(model_path)
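# Usage sketch for load_model_skipgram; the path below is a placeholder. The
# call above loads a *text*-format file (binary defaults to False). On gensim
# releases before 4.0, where Word2Vec.load_word2vec_format still exists, the
# returned model answers similarity queries directly:
skipgram = load_model_skipgram('vectors/skipgram.vec')  # hypothetical path
print(skipgram.most_similar('king', topn=5))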
"Somalia":"SOM", "South_Africa":"ZAF",
"South_Korea":"KOR", "South Sudan":"SSD", "Spain":"ESP", "Sri_Lanka":"LKA", "Sudan":"SDN",
"Suriname":"SUR", "Svalbard Jan Mayen":"SJM",
"Swaziland":"SWZ", "Sweden":"SWE", "Switzerland":"CHE", "Syria":"SYR",
"Taiwan":"TWN", "Tajikistan":"TJK", "Tanzania":"TZA", "Thailand":"THA",
"Timor Leste":"TLS", "East_Timor":"TLS","Togo":"TGO", "Tokelau":"TKL", "Tonga":"TON", "Trinidad Tobago":"TTO",
"Tunisia":"TUN", "Turkey":"TUR",
"Turkmenistan":"TKM", "Turks Caicos Islands":"TCA", "Tuvalu":"TUV", "U.S. Minor Outlying Islands":"UMI",
"Virgin_Islands":"VIR", "Uganda":"UGA",
"Ukraine":"UKR", "United_Arab_Emirates":"ARE", "United_Kingdom":"GBR",
"UK":"GBR", "United_States":"USA", "USA":"USA", "America":"USA",
"Uruguay":"URY", "Uzbekistan":"UZB", "Vanuatu":"VUT", "Vatican":"VAT", "Venezuela":"VEN",
"Vietnam":"VNM", "Wallis Futuna":"WLF",
"Western_Sahara":"ESH", "Yemen":"YEM", "Zambia":"ZMB", "Zimbabwe":"ZWE"}
prebuilt = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
vocab_set = set(prebuilt.vocab.keys())
countries = stopword_country_names.keys()
idx_country_mapping = {}
index = numpy.empty(shape=(len(countries), 300), dtype=dtype)
for idx, country in enumerate(countries):
    country = unidecode(country)
    try:
        vector = prebuilt[country]
    except KeyError:
        # Skip countries missing from the pretrained vocabulary instead of
        # silently reusing the vector from the previous iteration.
        continue
    index[idx] = vector
    idx_country_mapping[idx] = stopword_country_names[country]
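# A sketch of how the index matrix and idx_country_mapping above might be used:
# find the country whose embedding row is most cosine-similar to an arbitrary
# query vector. Only rows recorded in idx_country_mapping are compared, since
# countries skipped on KeyError leave their numpy.empty row uninitialized.
# The query word "election" is only an illustration.
def nearest_country(query_vector, index, idx_country_mapping):
    best_code, best_sim = None, -1.0
    q_norm = numpy.linalg.norm(query_vector)
    for i, code in idx_country_mapping.items():
        row = index[i]
        sim = row.dot(query_vector) / (numpy.linalg.norm(row) * q_norm)
        if sim > best_sim:
            best_code, best_sim = code, sim
    return best_code, best_sim

code, score = nearest_country(prebuilt['election'], index, idx_country_mapping)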
import argparse, sys, numpy
from gensim.models import Word2Vec
parser = argparse.ArgumentParser(
description='Generate a .tsv of word2vec vectors for a word list.')
parser.add_argument('-i', '--input', default='data')
parser.add_argument('-m', '--model', default='models/GoogleNews-vectors-negative300.bin')
args = parser.parse_args()
print('Loading wordlist from {}/wordlist'.format(args.input))
wordlist = numpy.genfromtxt('{}/wordlist'.format(args.input), dtype='str')
words = []
vectors = []
print('Loading model from ' + args.model)
model = Word2Vec.load_word2vec_format(args.model, binary=True)
print('Looking up {} words.'.format(len(wordlist)))
for word in wordlist:
    if word in model:
        print('added: {}'.format(word))
        words.append(word)
        vectors.append(model[word])
    else:
        print('no vector: {}'.format(word))
print('Saving {:.2%} of the words.'.format(len(words) / len(wordlist)))
numpy.savetxt('{}/words'.format(args.input), words, fmt='%s')
print('Saving word vectors.')
numpy.savetxt('{}/vectors'.format(args.input), vectors, fmt='%.8f', delimiter='\t')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="word2vec model path")
    parser.add_argument("format", help="1 = binary format, 0 = text format", type=int)
    parser.add_argument("k", help="number of clusters", type=int)
    parser.add_argument("output", help="output file")
    args = parser.parse_args()

    start = time.time()
    print("Load word2vec model ... ", end="", flush=True)
    w2v_model = Word2Vec.load_word2vec_format(args.model, binary=bool(args.format))
    print("finished in {:.2f} sec.".format(time.time() - start), flush=True)

    word_vectors = w2v_model.wv.syn0
    n_words = word_vectors.shape[0]
    vec_size = word_vectors.shape[1]
    print("#words = {0}, vector size = {1}".format(n_words, vec_size))

    start = time.time()
    print("Compute clustering ... ", end="", flush=True)
    kmeans = KMeans(n_clusters=args.k, n_jobs=-1, random_state=0)
    idx = kmeans.fit_predict(word_vectors)
    print("finished in {:.2f} sec.".format(time.time() - start), flush=True)

    start = time.time()
    print("Generate output file ... ", end="", flush=True)
    word_centroid_list = list(zip(w2v_model.wv.index2word, idx))
    word_centroid_list_sort = sorted(word_centroid_list, key=lambda el: el[1], reverse=False)
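    # Sketch of the output step the snippet cuts off before: write one
    # "word<TAB>cluster_id" line per entry to args.output, then close out the
    # timing message as the earlier steps do. A plausible continuation, not
    # the original code.
    with open(args.output, "w") as out:
        for word, cluster_id in word_centroid_list_sort:
            out.write("{}\t{}\n".format(word, cluster_id))
    print("finished in {:.2f} sec.".format(time.time() - start), flush=True)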
raise Exception("Embedding dictionary file not found!")
# Load model dictionary
model_dict = cPickle.load(open(args.model_dictionary, 'r'))
str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in model_dict])
i_dim = len(str_to_idx.keys())
logger.info("Vocabulary size: %d" % i_dim)
word_freq = dict([(tok_id, freq) for _, tok_id, freq, _ in model_dict])
# Load pretrained word embeddings
if uses_word2vec:
    import gensim, logging
    embedding_dict = gensim.models.Word2Vec.load_word2vec_format(args.embedding_dictionary, binary=True)
else:
    embedding_dict = cPickle.load(open(args.embedding_dictionary, "rb"))

if uses_word2vec:
    raw_emb_dim = embedding_dict['hello'].shape[0]
else:
    raw_emb_dim = embedding_dict[embedding_dict.keys()[0]].shape[0]
logger.info("Raw word embedding dim: %d" % raw_emb_dim)
W_emb_raw = numpy.zeros((i_dim, raw_emb_dim))
words_found = 0
unique_word_indices_found = []
unique_words_left_out = []
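# A sketch (not the original code) of how these counters are presumably used:
# walk the model vocabulary, copy the pretrained vector for every token the
# embedding dictionary knows, and record the rest as left out.
for tok, tok_id in str_to_idx.items():
    if tok in embedding_dict:
        W_emb_raw[tok_id] = embedding_dict[tok]
        words_found += 1
        unique_word_indices_found.append(tok_id)
    else:
        unique_words_left_out.append(tok)
logger.info("Found pretrained vectors for %d/%d words" % (words_found, i_dim))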
''' Generate a dataset of (input, output) pairs where the
input is an embedded vector and output is
an embedded vector for the lemmatized form.
Args
----
sentences : list
list of sentences where each sentence is list of tokens
max_words : integer
maximum number of words allowed in sentence
train_test_split : boolean
whether to split data into 2 sets
'''
num_sentences = len(sentences)
model = models.Word2Vec.load_word2vec_format(
'../storage/GoogleNews-vectors-negative300.bin',
binary=True)
vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
lemmatizer = WordNetLemmatizer().lemmatize
X = np.zeros((num_sentences, max_words, 300))
y = np.zeros((num_sentences, max_words, 300))
K = np.zeros(num_sentences)
I = np.arange(num_sentences)
param_dict = {}
param_dict['max_words'] = max_words
for sent_i, words in enumerate(sentences):
    if sent_i % 1000 == 0:
        print("{} sentences parsed. {} remaining.".format(
            sent_i, num_sentences - sent_i))