def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
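# A minimal, self-contained sketch of the same save/load/infer round trip using
# only gensim; the corpus, file name, and query below are illustrative stand-ins
# for the test fixtures above.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

docs = ["scientists study data", "cats chase mice"]
tagged = [TaggedDocument(simple_preprocess(d), [i]) for i, d in enumerate(docs)]
m = Doc2Vec(tagged, epochs=1, min_count=1, vector_size=10)
m.save("/tmp/d2v.model")
m = Doc2Vec.load("/tmp/d2v.model")
query_vec = m.infer_vector(simple_preprocess("scientists"))  # vector for unseen text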
def similarity_3_contexts(p, t):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    # with these weights, only the between-entities context contributes
    return 0 * bef + 1 * bet + 0 * aft
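# Because unitvec() scales each vector to unit length, the dot products above
# are cosine similarities in [-1, 1]. A self-contained check:
import numpy as np
from numpy import dot
from gensim import matutils

a, b = np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0])
print(dot(matutils.unitvec(a), matutils.unitvec(b)))  # ~1.0 for parallel vectors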
def make_index():
    logging.info('loading dictionary')
    dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
    logging.info('loading corpus')
    corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
    tfidf = gensim.models.TfidfModel(corpus)
    logging.info('loading model')
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
    logging.info('building lda docs')
    lda_corpus = model[tfidf[corpus]]
    logging.info('building index')
    index = gensim.similarities.docsim.Similarity('/tmp/lda_index.txt', lda_corpus, 1000)
    index.save('svd/lda_index.txt')
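# A hedged sketch of using the saved index: reload it and score a new document
# against every indexed one (dictionary, tfidf, and model as built in make_index).
index = gensim.similarities.docsim.Similarity.load('svd/lda_index.txt')
query_bow = dictionary.doc2bow('some query text'.split())
sims = index[model[tfidf[query_bow]]]  # similarity of the query to each document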
def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:
    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)
    if os.path.isdir(input):
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        sentences = gensim.models.word2vec.LineSentence(input)
    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    # Note: the constructor call above already trains once; this adds 10 more epochs.
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only, to reduce the memory footprint: this is the file that the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
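# The ".vectors-only" file holds just the KeyedVectors, so a consumer such as
# the online lexicon can memory-map it instead of loading the full model; the
# path below is illustrative.
from gensim.models import KeyedVectors
vectors = KeyedVectors.load("out/mymodel.vectors-only", mmap='r')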
else:
    params["scoreperclass"] = True
if "word_norm" not in params:
    params["word_norm"] = 1
if "oov_random" not in params:
    params["oov_random"] = 0
if "emb_model" in params:
    emb_models = []
    print("===> use pre-trained embeddings...")
    model_str = params["emb_model"].split(',')
    for m_s in model_str:
        gensim_format = ".gensim" in m_s
        if gensim_format:
            emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
        else:
            emb_models.append(gensim.models.KeyedVectors.load_word2vec_format(m_s, binary=True))
    print("<=== loaded {} models".format(len(emb_models)))
if "emb_dim" in params:
    emb_dim = int(params["emb_dim"])
if "gpu" in params:
    if params["gpu"] == "1":
        print("using gpu...")
    else:
        print("using cpu...")
if "wdist" in params:
    wdist_file = params["wdist"]
else:
    wdist_file = None
use_mixed_data = False
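# For illustration only: a hypothetical params dict that exercises the branches
# above (comma-separated embedding paths; flags are passed as strings).
params = {
    "emb_model": "twitter.gensim,GoogleNews-vectors-negative300.bin",
    "emb_dim": "300",
    "gpu": "1",
}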
def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))
    # load the binary embeddings model with gensim (word2vec and rdf2vec embeddings)
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    # model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # convert the entity label format to the rdf2vec label format
        # rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        # print(rdf2vec_entity_label)
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print("done loading gensim entities. %d missing" % missing)
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print(embedding_matrix)
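# A hypothetical call matching the signature above: 'dims' must equal the
# vector size of the model at 'path', and the vocabulary maps entity IRIs to
# row indices of the saved matrix.
config = {'path': 'models/rdf2vec_model', 'dims': 200}
vocab = {'http://dbpedia.org/resource/Berlin': 0}
load_embeddings_gensim(config, 'rdf2vec', vocab, 'entity_embeddings.npy')
entity_matrix = np.load('entity_embeddings.npy')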
print "third vocab"
#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
t3.append(p)
print "fourth vocab"
#trin models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0, hs=1, window=10, size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10, size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10, size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10, size=300)
print "model prepared"
#for each model calculate features^ n_similarity between st and something else
model_list=[model0,model1,model2,model3] #,model4 ,model5,model6,model7]
n_sim=list()
for model in model_list:
n_sim_pt=list()
for i in range(len(st)):
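        pass  # body truncated in the source snippet
# The "n_similarity" feature mentioned above is gensim's cosine similarity
# between the mean vectors of two token sets; a self-contained toy example
# (pre-4.0 API, matching the size= keyword used above):
toy = gensim.models.Word2Vec([["patent", "title"], ["search", "query"]], min_count=1, size=10)
print(toy.wv.n_similarity(["patent", "title"], ["search", "query"]))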
CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)), dtype=theano.config.floatX)  # @UndefinedVariable
self.C = theano.shared(value=CNP, name='C')
# self.C = theano.printing.Print("C = ")(self.C)
# Selectional Preferences
Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
self.C1 = theano.shared(value=Ca1NP, name='C1')
self.C2 = theano.shared(value=Ca2NP, name='C2')
# argument embeddings
ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable
if ex_emb:
    import gensim
    external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)
    for idArg in xrange(self.a):
        arg = data.id2Arg[idArg].lower().split(' ')
        new = np.zeros(k, dtype=theano.config.floatX)
        size = 0
        for ar in arg:
            if ar in external_embeddings:
                new += external_embeddings[ar]
                size += 1
        if size > 0:
            # average the external embeddings of the argument's tokens
            ANP[idArg] = new / size
self.A = theano.shared(value=ANP, name='A')  # (a1, k)
self.Ab = theano.shared(value=np.zeros(a, dtype=theano.config.floatX),  # @UndefinedVariable
                        name='Ab', borrow=True)
argument = sys.argv[1]
filename = argument.split('/')[-1]
args = filename.split('.')[0].split('__')
(urlhash, algo, vectorsize, windowsize) = args
if algo == "skipgram":
    skipgram = 1
else:
    skipgram = 0
data = gensim.models.word2vec.LineSentence(argument)
model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize), sg=skipgram, workers=2, iter=5, cbow_mean=1)
model.init_sims(replace=True)  # L2-normalise the vectors in place to save memory
model.save_word2vec_format(root + '/trained/' + urlhash + '.model', binary=True)
os.remove(root + '/tmp/' + urlhash)
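# Downstream code would reload the binary file written above with gensim's
# word2vec-format loader (path illustrative):
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('trained/urlhash.model', binary=True)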
import os
from collections import OrderedDict
import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors
word_vectors = get_data('word2vec')  # not in book
wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')  # not in book; the reader is expected to compose this path
if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)
###################################################
# Still need to create a class derived from gensim's Word2Vec model instead of relying on word_vectors globals
COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])
def component_vector(words):
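    # The snippet is cut off at this def; a plausible body, assuming the
    # component vector is the sum of the seed words' vectors (a hypothetical
    # reconstruction, not verbatim book code):
    import numpy as np
    vector = np.zeros(word_vectors.vector_size)
    for word in words:
        vector += word_vectors[word]  # raises KeyError for out-of-vocabulary words
    return vector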