import os
import gensim

def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:
    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)
    if os.path.isdir(input):
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        sentences = gensim.models.word2vec.LineSentence(input)
    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only, to reduce the memory footprint: this is the file the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
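A minimal usage sketch for the helper above. The corpus path, output directory, model name, and query word are placeholder values assumed for illustration; loading the vectors-only file with mmap keeps the memory footprint small.

# Hypothetical paths, for illustration only.
train_word2vec_model("corpus.txt", "models/", "my_w2v")
# Later, load just the vectors (not the full trainable model):
lexicon_vectors = gensim.models.KeyedVectors.load("models/my_w2v.vectors-only", mmap='r')
print(lexicon_vectors.most_similar("court", topn=5))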
else:
    params["scoreperclass"] = True
if "word_norm" not in params.keys():
    params["word_norm"] = 1
if "oov_random" not in params.keys():
    params["oov_random"] = 0
if "emb_model" in params.keys():
    emb_models = []
    print("===> use pre-trained embeddings...")
    model_str = params["emb_model"].split(',')
    for m_s in model_str:
        gensimFormat = ".gensim" in m_s
        if gensimFormat:
            emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
        else:
            emb_models.append(gensim.models.KeyedVectors.load_word2vec_format(m_s, binary=True))
    print("<=== loaded {} models".format(len(emb_models)))
if "emb_dim" in params.keys():
    emb_dim = int(params["emb_dim"])
if "gpu" in params.keys():
    if params["gpu"] == "1":
        print("using gpu...")
    else:
        print("using cpu...")
if "wdist" in params.keys():
    wdist_file = params["wdist"]
else:
    wdist_file = None
use_mixed_data = False
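The block above reads its settings from a params dictionary. A sketch of what such a dictionary might look like for the keys it consults; the values below are illustrative assumptions, not from the original.

# Illustrative values only; keys mirror those checked above.
params = {
    "emb_model": "glove.w2v.gensim,news.bin",  # comma-separated embedding model paths
    "emb_dim": "300",
    "gpu": "1",
    "wdist": "word_distances.txt",
    # "word_norm", "oov_random" and "scoreperclass" fall back to the defaults set above
}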
import numpy as np
import gensim

def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))
    # load embeddings binary model with gensim for word2vec and rdf2vec embeddings
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    # model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # strip entity label format to rdf2vec label format
        # rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print("done loading gensim entities. %d missing" % missing)
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print(embedding_matrix)
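A hedged usage sketch for the loader above. The config, vocabulary entries, and output path are assumptions for illustration; the real 'path' and 'dims' come from the caller's configuration.

# Hypothetical configuration and entity vocabulary.
embeddings_config = {'path': 'rdf2vec_model.gensim', 'dims': 200}
vocabulary = {'http://dbpedia.org/resource/Berlin': 0,
              'http://dbpedia.org/resource/Paris': 1}
load_embeddings_gensim(embeddings_config, 'entities', vocabulary, 'entity_embeddings.npy')
# The saved matrix can then seed an embedding layer, e.g. via np.load('entity_embeddings.npy').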
print "third vocab"
#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
t3.append(p)
print "fourth vocab"
#trin models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0, hs=1, window=10, size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10, size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10, size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10, size=300)
print "model prepared"
#for each model calculate features^ n_similarity between st and something else
model_list=[model0,model1,model2,model3] #,model4 ,model5,model6,model7]
n_sim=list()
for model in model_list:
n_sim_pt=list()
for i in range(len(st)):
CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX)  # @UndefinedVariable
self.C = theano.shared(value=CNP, name='C')
# self.C = theano.printing.Print("C = ")(self.C)
# Selectional Preferences
Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
self.C1 = theano.shared(value=Ca1NP, name='C1')
self.C2 = theano.shared(value=Ca2NP, name='C2')
# argument embeddings
ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable
if ex_emb:
    import gensim
    external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
    for idArg in xrange(self.a):
        arg = data.id2Arg[idArg].lower().split(' ')
        new = np.zeros(k, dtype=theano.config.floatX)
        size = 0
        for ar in arg:
            if ar in external_embeddings:
                new += external_embeddings[ar]
                size += 1
        if size > 0:
            ANP[idArg] = new / size
self.A = theano.shared(value=ANP, name='A')  # (a1, k)
self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX),  # @UndefinedVariable
                        name='Ab', borrow=True)
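Note that membership tests and indexing directly on a Word2Vec model, as in the loop above, were deprecated in newer gensim releases; the equivalent access goes through the model's KeyedVectors. A self-contained sketch of the same averaging pattern in that style; the toy corpus and word list are illustrative assumptions.

import numpy as np
import gensim

# Tiny toy model, for illustration only.
model = gensim.models.Word2Vec([["alpha", "beta"], ["alpha", "gamma"]], size=10, min_count=1)
kv = model.wv
words = ["alpha", "unseen"]
avg = np.zeros(10, dtype=np.float32)
found = 0
for w in words:
    if w in kv:          # membership check on KeyedVectors, not on the model itself
        avg += kv[w]
        found += 1
if found > 0:
    avg /= found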
from gensim import models
from .topics import GensimWrapper

# Expose LsiModel.add_documents under the common `update` name used by the wrappers.
models.LsiModel.update = models.LsiModel.add_documents
models.LsiModel.add_documents = lambda self, *args, **kwargs: self.update(*args, **kwargs)


class LsiWrapper(GensimWrapper):
    name = 'Latent Semantic Indexing'
    Model = models.LsiModel
    has_negative_weights = True
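Under the hood the wrapper relies on gensim's LsiModel, whose add_documents method performs the incremental update that the alias above exposes as update. A minimal gensim-only sketch; the toy documents are illustrative assumptions.

from gensim import corpora, models

# Toy corpus, for illustration only.
docs = [["human", "computer", "interface"], ["graph", "trees", "minors"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
# Incrementally fold in a new document; this is the operation aliased as `update` above.
lsi.add_documents([dictionary.doc2bow(["graph", "interface"])])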
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1E-3,
}
all_sentences = []
for law in self.laws.values():
    for article in law.sentences.keys():
        for par in law.sentences[article]:
            for per in law.sentences[article][par]:
                all_sentences.append(per)
self.model = gensim.models.Word2Vec(all_sentences, **params)
print('Model training complete!')
self.model.wv.save_word2vec_format('model')
return self.model
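The method above writes the trained vectors in word2vec text format to a file named 'model'; they can be read back with KeyedVectors. The query word below is an assumption for illustration.

from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('model')
# Query word chosen purely for illustration.
print(vectors.most_similar('law', topn=3))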
def sampling_traning(self):
    # SGNS; suggested parameters to be tuned: size, window, negative, workers, seed
    # to tune other parameters, please read https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    w2v = gensim.models.Word2Vec(sentences=None, size=self.emb_dim, window=self.window, sg=1, hs=0, negative=self.negative, ns_exponent=0.75,
                                 alpha=0.025, min_alpha=0.0001, min_count=1, sample=0.001, iter=4, workers=self.workers, seed=self.seed,
                                 corpus_file=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                                 max_vocab_size=None, max_final_vocab=None, trim_rule=None)  # w2v constructor, default parameters
    for t in range(len(self.G_dynamic)):
        t1 = time.time()
        if t == 0:  # offline ==============================
            G0 = self.G_dynamic[t]  # initial graph
            sentences = simulate_walks(nx_graph=G0, num_walks=self.num_walks, walk_length=self.walk_length)
            sentences = [[str(j) for j in i] for i in sentences]
            w2v.build_vocab(sentences=sentences, update=False)  # initial training, so update=False
            w2v.train(sentences=sentences, total_examples=w2v.corpus_count, epochs=w2v.iter)  # follow w2v constructor
        else:  # incremental adapting ==============================
            G0 = self.G_dynamic[t-1]  # previous graph
            G1 = self.G_dynamic[t]  # current graph
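The incremental branch is cut off here. In this style of dynamic embedding the usual continuation is to sample fresh walks on the current graph and fold them into the existing model; a hedged sketch of that step, assuming the same simulate_walks helper (how affected nodes are selected is not shown in the original and is omitted here):

# Sketch only: re-walk the current graph and update the existing model in place.
sentences = simulate_walks(nx_graph=G1, num_walks=self.num_walks, walk_length=self.walk_length)
sentences = [[str(j) for j in i] for i in sentences]
w2v.build_vocab(sentences=sentences, update=True)   # extend the vocabulary with new nodes
w2v.train(sentences=sentences, total_examples=w2v.corpus_count, epochs=w2v.iter)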
def get_word_vectors(self, vocab):
    """
    Load pre-trained word vectors and build the corresponding embedding matrix.
    :param vocab: words contained in the training set
    :return: word-vector matrix aligned with vocab
    """
    word_vectors = (1 / np.sqrt(len(vocab)) * (2 * np.random.rand(len(vocab), self._embedding_size) - 1))
    if os.path.splitext(self._word_vectors_path)[-1] == ".bin":
        word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path, binary=True)
    else:
        word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path)
    for i in range(len(vocab)):
        try:
            vector = word_vec[vocab[i]]
            word_vectors[i, :] = vector
        except KeyError:
            print(vocab[i] + " is not in the pre-trained word vectors")
    return word_vectors
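A hedged usage sketch for the method above, assuming a hypothetical owner class that supplies the two attributes it reads (_word_vectors_path and _embedding_size); the path, dimension, and vocab list are placeholders.

# Hypothetical minimal owner class so the method above can be exercised.
class VectorLoader:
    _word_vectors_path = "vectors.bin"          # placeholder path to pre-trained vectors
    _embedding_size = 300
    get_word_vectors = get_word_vectors         # reuse the function above as a method

embedding_matrix = VectorLoader().get_word_vectors(["the", "court", "law"])
print(embedding_matrix.shape)  # expected: (3, 300)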