def loadWord2Vec(filename):
    """Load a word2vec model from a file."""
    from gensim.models.keyedvectors import KeyedVectors
    return KeyedVectors.load_word2vec_format(filename, binary=True)
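A minimal usage sketch; the file path is illustrative, not part of the snippet above:

# Hypothetical usage of loadWord2Vec; the path is an assumption.
model = loadWord2Vec('GoogleNews-vectors-negative300.bin')
print(model.similarity('king', 'queen'))   # cosine similarity between two words
print(model.most_similar('king', topn=3))  # nearest neighbours by cosine similarity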
def task_set_embedding_matrix(self):
    print('stage: set_embedding_matrix')
    glove_data = hparams['data_dir'] + hparams['embed_name']
    from gensim.models.keyedvectors import KeyedVectors
    import numpy as np
    embed_size = int(hparams['embed_size'])
    embeddings_index = {}
    if not os.path.isfile(glove_data):
        # no embedding file: leave the matrix unset so the layer is trained from scratch
        self.embedding_matrix = None
    else:
        # load pretrained vectors (word2vec text format)
        glove_model = KeyedVectors.load_word2vec_format(glove_data, binary=False)
        for idx in range(self.output_lang.n_words):
            word = self.output_lang.index2word[idx]
            if word in glove_model.vocab:
                # copy the pretrained vector for in-vocabulary words
                embeddings_index[word] = np.asarray(glove_model[word], dtype='float32')
            else:
                # out-of-vocabulary: initialize with small uniform noise
                print('fill with random values', idx, word)
                embeddings_index[word] = np.random.uniform(
                    low=self.uniform_low, high=self.uniform_high, size=(embed_size,))
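As shown, the snippet fills embeddings_index but never assembles self.embedding_matrix in the else branch. A minimal continuation, assuming rows are aligned with output_lang indices, might look like this (it would sit at the end of the else branch):

# Hypothetical continuation: stack the per-word vectors into a dense matrix.
embedding_matrix = np.zeros((self.output_lang.n_words, embed_size), dtype='float32')
for idx in range(self.output_lang.n_words):
    word = self.output_lang.index2word[idx]
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]
self.embedding_matrix = embedding_matrix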
def load_word2vec(self):
    # module-level imports assumed: from gensim.models import KeyedVectors; import numpy as np
    print("loading word2vec...")
    word_vectors = KeyedVectors.load_word2vec_format(
        "GoogleNews-vectors-negative300.bin", binary=True)
    wv_matrix = []
    for word in self.data["vocab"]:
        if word in word_vectors.vocab:
            wv_matrix.append(word_vectors.word_vec(word))
        else:
            # out-of-vocabulary words get small random vectors
            wv_matrix.append(
                np.random.uniform(-0.01, 0.01, 300).astype("float32"))
    # one extra row for UNK and one for zero padding
    wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
    wv_matrix.append(np.zeros(300).astype("float32"))
    return np.array(wv_matrix)
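The returned matrix is typically handed to an embedding layer. A hedged sketch using PyTorch (PyTorch is an assumption here; the snippet itself is framework-agnostic):

# Hypothetical usage; 'model' is an instance of the class defining load_word2vec.
import torch
import torch.nn as nn

wv_matrix = model.load_word2vec()
embedding = nn.Embedding.from_pretrained(
    torch.from_numpy(wv_matrix), freeze=False)  # pretrained weights, still trainable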
def build_T(model_path, sigma):
    """
    :param model_path: the path of the final model
    :param sigma: bandwidth of the Gaussian kernel applied to the distances
    :return: (T matrix, word-to-index dictionary for T)
    """
    emo2vec = KeyedVectors.load_word2vec_format(model_path, binary=False)
    idx2word = dict(enumerate(emo2vec.index2word))
    n = 100  # truncated from len(idx2word) to keep the matrix small
    # invert the idx -> word mapping for the first n words
    word2idx = {idx2word[i]: i for i in range(n)}
    t = np.empty((n, n), dtype='float16')
    # pairwise squared cosine distances (to_cosine_dist is defined elsewhere in the source)
    for w1, i in word2idx.items():
        for w2, j in word2idx.items():
            t[i, j] = to_cosine_dist(emo2vec.similarity(w1, w2)) ** 2
    # Gaussian kernel, then l1-normalize each column (normalize is sklearn.preprocessing.normalize)
    t /= sigma ** 2
    t = np.exp(-t)
    t = normalize(t, axis=0, norm='l1', copy=False)
    return t, word2idx
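A hedged usage sketch; the file name and sigma value are assumptions:

# Hypothetical usage of build_T.
T, word2idx = build_T('emo2vec.txt', sigma=0.5)
print(T.shape)        # (100, 100)
print(T[:, 0].sum())  # each column is l1-normalized, so ~1.0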
def load_word2vec():
    # module-level imports assumed: from gensim.models import KeyedVectors; import numpy as np
    print("loading word2vec...")
    word_vectors = KeyedVectors.load_word2vec_format(
        "{}/GoogleNews-vectors-negative300.bin".format(opt.data_path), binary=True)
    wv_matrix = []
    if opt.dataset == 'vse':
        for idx in sorted(opt.vocab.idx2word.keys()):
            word = opt.vocab.idx2word[idx]
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                wv_matrix.append(
                    np.random.uniform(-0.01, 0.01, 300).astype("float32"))
    else:
        for word in data.vocab:
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                # completed to mirror the 'vse' branch: random vectors for OOV words
                wv_matrix.append(
                    np.random.uniform(-0.01, 0.01, 300).astype("float32"))
    return np.array(wv_matrix)
def cache_word_embeddings(word_embeddings_file, cache_file):
    if not word_embeddings_file.endswith('.gz'):
        logger.warning('expecting a .gz file. Is {} in the correct format?'
                       .format(word_embeddings_file))
    vocab_size, vec_dim = 0, 0
    if not os.path.exists(cache_file):
        # cache does not exist; make the cache folder if needed
        if not os.path.exists(os.path.dirname(cache_file)):
            os.mkdir(os.path.dirname(cache_file))
        logger.info('caching the word embeddings in np.memmap format')
        wv = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
        # copy the raw vectors into a disk-backed memmap
        fp = np.memmap(cache_file, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        # write the vocabulary, one word per line, in index order
        with open(cache_file + '.vocab', 'w', encoding='utf-8') as f:
            logger.info('writing out vocab for {}'.format(word_embeddings_file))
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print(w, file=f)
        # record the matrix shape so the memmap can be reopened later
        with open(cache_file + '.dimensions', 'w', encoding='utf-8') as f:
            logger.info('writing out dimensions for {}'.format(word_embeddings_file))
            print(wv.syn0.shape[0], wv.syn0.shape[1], file=f)
        vocab_size, vec_dim = wv.syn0.shape
        del fp, wv  # deleting the memmap flushes it to disk
        print('cached {} into {}'.format(word_embeddings_file, cache_file))
    return vocab_size, vec_dim
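Reading the cache back follows directly from the format written above; a sketch assuming the '.dimensions' and '.vocab' side files:

# Hypothetical reader for the cache format produced by cache_word_embeddings.
with open(cache_file + '.dimensions') as f:
    vocab_size, vec_dim = map(int, f.read().split())
vectors = np.memmap(cache_file, dtype=np.double, mode='r',
                    shape=(vocab_size, vec_dim))
with open(cache_file + '.vocab', encoding='utf-8') as f:
    words = f.read().splitlines()
word2row = {w: i for i, w in enumerate(words)}  # word -> row in the memmap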
def convert_embeddings(glove_input_file, word2vec_output_file,
                       embeddings_path='embeddings.npz',
                       vocab_path='map.json'):
    """
    Convert GloVe embeddings to word2vec format and export the weights and vocab
    :param glove_input_file: GloVe-format input file
    :param word2vec_output_file: where to write the word2vec-format copy
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """
    glove2word2vec(glove_input_file, word2vec_output_file)
    model = KeyedVectors.load_word2vec_format(word2vec_output_file)
    weights = model.syn0
    # note: np.save writes .npy format regardless of the file name
    np.save(open(embeddings_path, 'wb'), weights)
    vocab = {k: v.index for k, v in model.vocab.items()}
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
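Loading the exported files back is straightforward; a sketch under the defaults above (the array was written with np.save, so np.load returns a plain ndarray despite the .npz name; the looked-up word is illustrative):

# Hypothetical consumer of the files written by convert_embeddings.
import json
import numpy as np

weights = np.load(open('embeddings.npz', 'rb'))
with open('map.json') as f:
    vocab = json.load(f)
vector = weights[vocab['the']]  # one word's embedding vector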