import os
import sys

import gensim

# The input file name encodes the training configuration:
# <root>/tmp/<urlhash>__<algo>__<vectorsize>__<windowsize>
# ('root' is expected to be defined by the surrounding script)
argument = sys.argv[1]
filename = argument.split('/')[-1]
args = filename.split('.')[0].split('__')
(urlhash, algo, vectorsize, windowsize) = args
if algo == "skipgram":
    skipgram = 1
else:
    skipgram = 0
data = gensim.models.word2vec.LineSentence(argument)
model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize),
                               sg=skipgram, workers=2, iter=5, cbow_mean=1)
model.init_sims(replace=True)
model.save_word2vec_format(root + '/trained/' + urlhash + '.model', binary=True)
os.remove(root + '/tmp/' + urlhash)
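The vectors saved above can be read back with the same (older) gensim API used elsewhere in these snippets; the path and query word below are illustrative and assume the naming scheme from the script above.

# Minimal sketch: reload the exported binary vectors and query nearest neighbours.
loaded = gensim.models.Word2Vec.load_word2vec_format(root + '/trained/' + urlhash + '.model', binary=True)
print(loaded.most_similar('example', topn=5))  # 'example' is a placeholder query word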
def train_model(lst_sentence, path_model, min_count_p=5, workers_p=4, size_p=200, window_p=5):
    # Train a CBOW model (cbow_mean=0 sums the context vectors) and save it to disk.
    model = gensim.models.Word2Vec(lst_sentence, min_count=min_count_p, workers=workers_p,
                                   size=size_p, window=window_p, cbow_mean=0)
    model.save(path_model)
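A minimal, hypothetical call to train_model: lst_sentence is expected to be an iterable of tokenised sentences, and the output path is illustrative.

sentences = [["the", "room", "was", "clean"], ["staff", "were", "friendly"]]
train_model(sentences, "reviews.w2v.model")  # saves a gensim-format model to disk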
from multiprocessing import cpu_count

from gensim.models import Word2Vec

def word2vec_model(data, size, min_c):
    # Build the vocabulary, then train on the same corpus with one worker per CPU core.
    w2c_model = Word2Vec(size=size, min_count=min_c, workers=cpu_count())
    w2c_model.build_vocab(data)
    w2c_model.train(data)
    return w2c_model
import multiprocessing

import gensim

def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10):
    # Stream sentences from disk (one per line) and train a skip-gram model.
    workers = multiprocessing.cpu_count()
    sentences = gensim.models.word2vec.LineSentence(path_to_corpus)
    model = gensim.models.Word2Vec(sentences, min_count=min_count, size=size,
                                   window=window, sg=1, workers=workers)
    model.save(output_path)
def main(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    start = time()
    save_dir = args.path
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    sentences = Sentences(args.data)
    model = gensim.models.Word2Vec(size=args.dim, min_count=5, workers=16, sg=1)
    model.build_vocab(sentences)
    print('vocab built in {}'.format(timedelta(seconds=time() - start)))
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.save(os.path.join(save_dir, 'word2vec.{}d.{}k.bin'.format(args.dim, len(model.wv.vocab) // 1000)))
    model.wv.save_word2vec_format(os.path.join(
        save_dir, 'word2vec.{}d.{}k.w2v'.format(args.dim, len(model.wv.vocab) // 1000)))
    print('word2vec trained in {}'.format(timedelta(seconds=time() - start)))
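The main function above reads args.data, args.path, and args.dim; a hypothetical argparse wiring consistent with those attribute names might look like this.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', help='path to the training corpus')
    parser.add_argument('--path', help='directory in which to save the model')
    parser.add_argument('--dim', type=int, default=300, help='embedding dimensionality (default is illustrative)')
    main(parser.parse_args())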
# Assumed guard (mirrors the word2vec checkpoint logic below): reuse a saved phrase model if one exists.
if os.path.exists(PHRASE_MODEL_CHECKPOINT):
    phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
else:
    logger.info('training phrase model')
    # use LineSentence to stream text as opposed to loading it all into memory
    unigram_sentences = LineSentence(UNIGRAM_PATH)
    phrase_model = Phrases(unigram_sentences)
    phrase_model.save(PHRASE_MODEL_CHECKPOINT)

if not os.path.exists(BIGRAM_PATH):
    logger.info('converting words to phrases')
    export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

if os.path.exists(WORD2VEC_CHECKPOINT):
    word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
else:
    logger.info('training word2vec')
    word2vec = Word2Vec(corpus_file=BIGRAM_PATH, workers=cpu_count())
    word2vec.save(WORD2VEC_CHECKPOINT)

logger.info('job completed')
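Once trained, the Phrases model can be applied to a tokenised sentence to join frequent word pairs; a small sketch with made-up tokens:

tokens = ['new', 'york', 'hotels', 'are', 'expensive']
print(phrase_model[tokens])  # frequent pairs such as 'new york' come back joined, e.g. 'new_york'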
import numpy as np

def makeFeatureVec(words, model, num_features):
    # Average (or, if agg_mean is falsy, sum) the vectors of all in-vocabulary words.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    if agg_mean:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
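A hypothetical call, assuming the surrounding script defines model, index2word_set, and agg_mean (the first two are set up in the snippet that follows) and that the model uses 300-dimensional vectors:

doc_vec = makeFeatureVec(['clean', 'room', 'friendly', 'staff'], model, num_features=300)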
if modelformat == "gensim":
    model = gensim.models.Word2Vec.load(modelfile)
elif modelformat == "word2vec-text":
    model = gensim.models.Word2Vec.load_word2vec_format(modelfile, binary=False)
elif modelformat == "word2vec-binary":
    model = gensim.models.Word2Vec.load_word2vec_format(modelfile, binary=True)
else:
    raise Exception("Unknown model format: %s" % modelformat)

index2word_set = set(model.wv.vocab)
word2vecdim = model.wv.syn0.shape[1]

if keep_all_cols:
    myschema = [val for val in input_text_dataset.read_schema()]
else:
    myschema = []
for i in range(word2vecdim):
    myschema.append({"name": "word2vec_" + str(i), "type": "float"})
output_text_dataset.write_schema(myschema)
mywriter = output_text_dataset.get_writer()
def train(model_file):
    contexts = ContextCorpus(data_obj)
    # model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size=300, sample=1e-3, hs=1, window=5)  # a1
    # model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size=300, sample=1e-5, hs=0, window=5)  # a2
    # model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=5, sg=0, size=300, sample=1e-3, hs=1, window=5)  # a3
    # model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size=300, sample=1e-3, hs=0, window=5)  # a4
    # model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size=300, sample=1e-5, hs=0, window=5)  # a5
    model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size=300, sample=1e-4, hs=1, window=5)  # a6
    # ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    model.init_sims(replace=True)
    model.save(model_file)
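An illustrative follow-up to train: reload the saved model and inspect nearest neighbours (the file name and query word are placeholders).

model = gensim.models.Word2Vec.load('model_a6.w2v')
print(model.wv.most_similar('breakfast', topn=3))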
def load_embedding(self):
    """
    Load embedding matrix and word count from gensim object
    :return: nothing
    """
    # Loading gensim object
    logging.debug("-> Loading gensim file")
    gensim_obj = gensim.models.Word2Vec.load(self.embedding_file_path)
    # Copying gensim object embedding matrix
    logging.debug("-> Fetching embedding matrix from gensim model")
    self.embedding_matrix = gensim_obj.wv.syn0
    logging.debug("-> Matrix dimension: {}".format(self.embedding_matrix.shape))
    # Creating token-id mapping
    logging.debug("-> Creating word-id mapping")
    for i, item in enumerate(gensim_obj.wv.index2word, start=1):
        self.word_mapping[item] = i
    logging.debug("-> Creating padding vector (index=0)")
    pad_vector = np.random.rand(1, self.embedding_matrix.shape[1])
    self.embedding_matrix = np.insert(self.embedding_matrix, 0, pad_vector, axis=0)
    self.word_mapping["pad_token"] = 0
import gensim
import gensim.models.word2vec
model2 = gensim.models.Word2Vec.load("../models/model_win_2")
model = gensim.models.Word2Vec.load("../models/model")
word_list = [
    'gross',
    'dirty',
    'location',
    'breakfast',
    'smelly',
    'affordable',
    'hotel staff',
    'manager rude',
    'complimentary',
    'family',
    'awe',
    'shocked',