def compactness_score(model_path, topic_file_path, with_gensim=True):
"""
model_path: Word2Vec model file
    topic_file_path: Each line in the file is a topic, represented as
        a list of words separated by spaces
    Output: Prints a compactness score for each topic and a final score over all the topics.
"""
# Loading can be very slow if the model is large.
# User should consider loading the model just once for all the topic files.
print("Loading Word2Vec model: " + model_path)
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
print("Loading Done.")
print("Processing topic file: " + topic_file_path)
line_count = 0
result = []
with open(topic_file_path, 'r') as inputfile:
for line in inputfile:
line_count += 1
sims = []
line = line.strip(' \n').split(' ')
            print(line)
for i in range(len(line)):
if line[i] not in model.vocab:
continue
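                # NOTE: the snippet is clipped at this point. Below is a minimal, hedged
                # sketch of how the pairwise-similarity loop could continue; the scoring
                # (mean cosine similarity over in-vocabulary word pairs) is an assumption,
                # not necessarily the original author's exact aggregation.
                for j in range(i + 1, len(line)):
                    if line[j] not in model.vocab:
                        continue
                    sims.append(model.similarity(line[i], line[j]))
            if sims:
                topic_score = float(sum(sims)) / len(sims)
                result.append(topic_score)
                print("Topic %d compactness: %.4f" % (line_count, topic_score))
    if result:
        print("Final compactness over %d topics: %.4f" % (len(result), sum(result) / len(result)))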
import numpy as np
from tqdm import tqdm
from gensim.models import KeyedVectors

# NOTE: the original function header is not shown in this snippet; the name below is a
# placeholder, and the parameters are inferred from the body (w2vfile, word_to_index, embedsize).
def load_word_embeddings(w2vfile, word_to_index, embedsize):
    '''
    Load pre-trained word vectors for the words in word_to_index.

    Returns:
        word_embeddings : Dictionary mapping each word to its corresponding embedding
    '''
word_embeddings = {}
if w2vfile.endswith('.txt'):
f = open(w2vfile)
for line in tqdm(f):
values = line.split(" ")
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
if word in word_to_index:
word_embeddings[word] = coefs
f.close()
elif w2vfile.endswith('.bin'):
word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
for word in tqdm(word_to_index):
try:
word_embeddings[word] = word2vec[word.lower()]
except KeyError:
pass
else:
print ('Can\'t load word embeddings.')
exit(-1)
print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
if len(word_to_index) > len(word_embeddings):
print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))
for word in word_to_index:
if word not in word_embeddings:
word_embeddings[word] = np.zeros((embedsize,))
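Once the dictionary above is filled, it is usually converted into a dense matrix indexed by word id before being handed to a model. A minimal sketch, assuming word_to_index maps each word to a contiguous integer id starting at 0 and embedsize matches the vector dimension (both names come from the snippet above; build_embedding_matrix is only an illustrative name):

import numpy as np

def build_embedding_matrix(word_embeddings, word_to_index, embedsize):
    # Row idx of the matrix holds the vector for the word whose id is idx.
    matrix = np.zeros((len(word_to_index), embedsize), dtype='float32')
    for word, idx in word_to_index.items():
        matrix[idx] = word_embeddings[word]
    return matrix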
def check_oov_of_word_analogies(w2v_format_emb_file, analogy_file, is_vn=True, case_sensitive=True):
emb_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_format_emb_file,
binary=False,
unicode_errors='ignore')
f_reader = open(analogy_file, "r")
vocab_arr = []
for line in f_reader:
if not case_sensitive:
line = line.lower()
if line.startswith(': '):
continue
else:
for word in line.split(" | "):
# In Vietnamese, we have compound and single word.
# if is_vn:
# if " " in word:
def get_word_vectors(self, vocab: List[str]) -> np.ndarray:
"""
load word vector file,
:param vocab: vocab
:return:
"""
        pad_vector = np.zeros(self.__embedding_size)  # the padding ("") vector is all zeros
word_vectors = (1 / np.sqrt(len(vocab) - 1) * (2 * np.random.rand(len(vocab) - 1, self.__embedding_size) - 1))
if os.path.splitext(self.__word_vector_path)[-1] == ".bin":
word_vec = gensim.models.KeyedVectors.load_word2vec_format(self.__word_vector_path, binary=True)
else:
word_vec = gensim.models.KeyedVectors.load_word2vec_format(self.__word_vector_path)
for i in range(1, len(vocab)):
            try:
                # KeyedVectors supports direct indexing; the extra .wv is unnecessary
                vector = word_vec[vocab[i]]
                word_vectors[i, :] = vector
            except KeyError:
                print(vocab[i] + " is not in the word vector file")
word_vectors = np.vstack((pad_vector, word_vectors))
return word_vectors
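The returned matrix reserves row 0 for the padding vector and stores the vector for vocab[i] in row i, so it can be used directly as an id-to-vector lookup table. A small usage sketch (loader stands in for an instance of the class this method belongs to, which is not shown):

word_vectors = loader.get_word_vectors(vocab)           # shape: (len(vocab), embedding_size)
ids = [vocab.index(w) for w in ["the", "cat"] if w in vocab]
sentence_matrix = word_vectors[ids]                      # embedding lookup by row index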
def load(cls, tokens, limit=None):
return KeyedVectors.load_word2vec_format(
cls.path, binary=True, limit=limit)
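The limit argument here is gensim's own: load_word2vec_format reads only the first limit vectors from the file, and since word2vec files are conventionally sorted by frequency this keeps the most frequent words while capping memory. For example (the 200000 figure is arbitrary):

from gensim.models import KeyedVectors

# Read only the 200,000 most frequent vectors instead of the full 3M-word GoogleNews file.
vecs = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=200000)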
def _load_model(self, model_file="../data/corpus.model.bin", binary=False):
"""
Load model with C format word2vec file.
"""
if not os.path.exists(model_file):
raise Exception("Model file does not exist.")
model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=binary, unicode_errors="ignore")
return model
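Note that the binary flag has to match the file on disk (binary=True for the C binary format, binary=False for the text format), regardless of the file extension. A small usage sketch; the owning class is not shown, so the instance below is hypothetical:

loader = CorpusModelLoader()                      # hypothetical owner of _load_model
model = loader._load_model("../data/corpus.model.bin", binary=True)   # binary C format
print(model.most_similar("king", topn=5))         # "king" stands in for any in-vocabulary word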
for i in range(1,n+1):
D[i][0] = D[i-1][0] + 1
for j in range(1,m+1):
D[0][j] = D[0][j-1] + 1
for i in range(1,n+1):
for j in range(1,m+1):
D[i][j] = min(D[i-1][j]+1, D[i-1][j-1]+sub_cost(str1[i-1],str2[j-1]), D[i][j-1]+1)
return D[n][m]
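The loops above are the classic Levenshtein dynamic program, but the fragment omits the table allocation and the substitution cost. A self-contained sketch under the usual assumptions (cost 0 for a match, 1 for a substitution; edit_distance is a placeholder name):

def edit_distance(str1, str2):
    n, m = len(str1), len(str2)
    # D[i][j] = edit distance between the first i chars of str1 and the first j chars of str2
    D = [[0] * (m + 1) for _ in range(n + 1)]

    def sub_cost(a, b):
        return 0 if a == b else 1

    for i in range(1, n + 1):
        D[i][0] = D[i - 1][0] + 1
    for j in range(1, m + 1):
        D[0][j] = D[0][j - 1] + 1
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            D[i][j] = min(D[i - 1][j] + 1,
                          D[i - 1][j - 1] + sub_cost(str1[i - 1], str2[j - 1]),
                          D[i][j - 1] + 1)
    return D[n][m]

# edit_distance("kitten", "sitting") == 3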
if __name__ == '__main__':
args = parser.parse_args()
pp.pprint(args)
# Load the training data
vecfile = 'GoogleNews-vectors-negative300.bin'
vecs = KeyedVectors.load_word2vec_format(vecfile, binary=True)
with open(args.inputfile, 'r') as inputfile:
input = inputfile.readlines()
output_simple = open("pred_simple.txt", "w")
output_w2v = open("pred_ex1.txt", "w")
with open("data/en-train.txt", 'r') as train_file:
train_input = train_file.readlines()
with open("data/en-val.txt", 'r') as val_file:
val = val_file.readlines()
with open("data/en-test.txt", 'r') as test_file:
test = test_file.readlines()
def run(vectors_fpath, output_fpath="", only_letters=False, vocab_limit=None, pairs=False, batch_size=1000, threads_num=4, word_freqs=None):
print("Vectors: {}, only_letters: {}".format(vectors_fpath, only_letters), file=stderr)
print("Loading vectors from {}".format(vectors_fpath), file=stderr)
tic = time()
vectors = gensim.models.KeyedVectors.load_word2vec_format(
vectors_fpath, binary=False, unicode_errors='ignore')
vectors.init_sims(replace=True)
print("Vectors loaded in %d sec." % (time()-tic), file=stderr)
print("Vectors shape is: ", vectors.syn0norm.shape, file=stderr)
vocab_size = len(vectors.vocab)
print(("Vocabulary size: %i" % vocab_size))
# Limit the number of words for which to collect neighbours
if vocab_limit and vocab_limit < vocab_size:
vocab_size = vocab_limit
words = vectors.index2word[:vocab_size]
print(("Collect neighbours for %i most frequent words" % vocab_size))
try:
model = FastText.load_fasttext_format(embedding_weights_path)
pre_trained_embedding = "bin"
except:
print ("fastText binary file (.bin) is not found!")
if os.path.exists("./Word_embedding/wiki.en.vec"):
print ("Using wikipedia(en) pre-trained word vectors.")
else:
print ("Downloading wikipedia(en) pre-trained word vectors.")
chakin.download(number=2, save_dir="./Word_embedding")
print ("Loading vectors...")
if os.path.exists("./Word_embedding_model.pkl"):
with open("./Word_embedding_model.pkl", mode="rb") as f:
model = pickle.load(f)
else:
model = KeyedVectors.load_word2vec_format('./Word_embedding/wiki.en.vec')
with open("Word_embedding_model.pkl", mode="wb") as f:
pickle.dump(model, f)
pre_trained_embedding = "txt"
vocab_size = len(words_map)
word_dimension = model['a'].shape[0]
w = np.zeros((vocab_size,word_dimension),dtype=np.float32)
for k,v in words_map.items():
word = k
word_number = v
try:
w[word_number][:] = model[word]
except KeyError as e:
if pre_trained_embedding == "bin":
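            # NOTE: the snippet is clipped here; what follows is a hedged completion.
            # With the fastText binary, subword n-grams usually give OOV words a vector
            # before this branch is reached, so the rare leftovers are left as zero rows.
            pass
        else:
            # Plain-text vectors have no subword fallback: use a small random vector.
            w[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension)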
def main():
# arguments
args = parse_args()
if args.log: logging.basicConfig(filename=args.log, format='%(message)s', level=logging.INFO)
else: logging.basicConfig(format='%(message)s', level=logging.INFO)
# x
trn_graphs = read_graphs(args.tsv, args.trn_data)
dev_graphs = read_graphs(args.tsv, args.dev_data)
# lexicon
w2v = KeyedVectors.load_word2vec_format(args.w2v, binary=True) if args.w2v else None
f2v = fasttext.load_model(args.f2v) if args.f2v else None
a2v = KeyedVectors.load_word2vec_format(args.a2v, binary=True) if args.a2v else None
lexicon = POSLexicon(w2v=w2v, f2v=f2v, a2v=a2v, output_size=args.output_size)
# model
model = POSModel(feature_context=args.feature_context, batch_size=64, w2v_dim=100)
model.train(trn_graphs, dev_graphs, lexicon, num_steps=args.num_steps,
bagging_ratio=args.bagging_ratio, optimizer=args.optimizer, force_init=True)