'''
.. codeauthor:: svitlana vakulenko

Acknowledgements:
* fastText https://pypi.python.org/pypi/fasttext

Test scripts for the Python port of fastText.
'''
import fasttext
# EMBEDDINGS_MODEL_PATH = '../fastText/result/fil9.bin'
EMBEDDINGS_MODEL_PATH = 'embeddings/fil9.bin'
# print "Loading model from", EMBEDDINGS_MODEL_PATH
model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
# print "Finished loading"
print len(model.words) # number of words in dictionary
print model['king'] # get the vector of the word 'king'
print model['kingserwq'] # get the vector for an OOV word
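# Sketch (not in the original script): fastText composes OOV vectors from
# character n-grams, so 'kingserwq' should land closer to 'king' than an
# unrelated in-vocabulary word does. Assumes the returned vectors convert
# cleanly to numpy arrays.
import numpy as np

def cosine(u, v):
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(cosine(model['king'], model['kingserwq']))  # relatively high: shared n-grams
print(cosine(model['king'], model['carpet']))     # typically lower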
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from random import random
from numpy import array
from numpy import cumsum
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.layers.normalization import BatchNormalization
import fasttext
import re
dic_kor = fasttext.load_model('vectors/model_kor.bin')
import numpy as np

def loadvector(File):
    model = {}
    with open(File, 'r') as f:
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    return model
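# Format note: each line of the embeddings text file is expected to be
# "word v1 v2 ... vn" (word2vec .txt style), e.g. "king 0.12 -0.03 0.55 ...",
# so loadvector returns a dict mapping each word to a numpy array.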
dic_eng = loadvector('vectors/model_eng.txt')
import string
# map each lowercase letter to its index: {'a': 0, 'b': 1, ..., 'z': 25}
idchar = {ch: i for i, ch in enumerate(string.ascii_lowercase)}
import sys
def read_data(filename):
    # read a tab-separated file into a list of per-line field lists
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data
print('\n' * 11)
print('#########################################################\n'
      '#                                                       #\n'
      '#      Demonstration: Contextual Spacing 4 Korean       #\n'
      '#                                                       #\n'
      '#########################################################')
import fasttext
print('\nImporting dictionaries...')
model_drama = fasttext.load_model('vectors/model_drama.bin')
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense, Lambda
import keras.backend as K
from keras.callbacks import ModelCheckpoint
import keras.layers as layers
from keras import optimizers
adam_half = optimizers.Adam(lr=0.0005)
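# A minimal sketch of a model this setup could feed (an assumed architecture,
# not shown in this excerpt): a character-level bidirectional LSTM that emits
# a space/no-space probability per character, compiled with adam_half. The
# 100-dim input size is an assumption standing in for the fastText vectors.
from keras.layers import Bidirectional, TimeDistributed

spacing_model = Sequential()
spacing_model.add(Bidirectional(LSTM(64, return_sequences=True),
                                input_shape=(None, 100)))
spacing_model.add(TimeDistributed(Dense(1, activation='sigmoid')))
spacing_model.compile(optimizer=adam_half, loss='binary_crossentropy')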
from os.path import dirname, join

def __init__(self):
    filepath = join(dirname(__file__), "fasttext.model")
    self.estimator = fasttext.load_model(filepath)
if self.opt.get('fasttext_model') is not None:
    if not os.path.isfile(self.opt['fasttext_model']):
        emb_path = os.environ.get('EMBEDDINGS_URL')
        if not emb_path:
            raise RuntimeError('No pretrained fasttext model provided')
        fname = os.path.basename(self.opt['fasttext_model'])
        try:
            print('Trying to download a pretrained fasttext model'
                  ' from the repository')
            url = urllib.parse.urljoin(emb_path, fname)
            urllib.request.urlretrieve(url, self.opt['fasttext_model'])
            print('Downloaded a fasttext model')
        except Exception as e:
            raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable'
                               ' is set incorrectly', e)
    self.fasttext_model = fasttext.load_model(self.opt['fasttext_model'])
    if self.tok2emb and (self.fasttext_model.dim != self.dim):
        raise RuntimeError("Fasttext model and loaded embeddings have"
                           " different dimension sizes.")
else:
    print("No fasttext model provided: using loaded embeddings.")
"""
:param corpusfile: file containing the corpus to sample from
:param language: language from ['en', 'nl']
:param outfile: name of devcorpus file
:param window_size: minimal amount of tokens on each side of a generated misspelling
:param oov: True if the generated misspellings need to be absent from the vector vocabulary
:param samplesize: number of lines to sample
:param editdistance: the type of edit distances generated: 1, 2 or 1 and 2 (80-20 proportion)
"""
# load lexicon
assert language in ['en', 'nl']
with open('lexicon_{}.json'.format(language), 'r') as f:
    vocab = set(json.load(f))
# load vector vocab
model = fasttext.load_model('../data/embeddings_{}.bin'.format(language))
vector_vocab = model.words
# load sample
if samplesize:
    corpus_sample(corpusfile, 'devsample.json', samplesize)
    with open('devsample.json', 'r') as f:
        corpus = json.load(f)[0]
else:
    with open(corpusfile, 'r') as f:
        corpus = json.load(f)[0]
# generate misspellings corpus with corrections and detection contexts
functionlist = [letterswitch, letterdelete, letterinsert, lettersub]
correct_spellings = []
misspellings = []
misspelling_contexts = []
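# The letter-edit generators in functionlist are used above but not defined in
# this excerpt; minimal sketches of what such edit-distance-1 functions could
# look like:
import random

def letterdelete(word):
    # drop one randomly chosen character (edit distance 1)
    i = random.randrange(len(word))
    return word[:i] + word[i + 1:]

def letterswitch(word):
    # transpose two adjacent characters (edit distance 1); assumes len(word) > 1
    i = random.randrange(len(word) - 1)
    return word[:i] + word[i + 1] + word[i] + word[i + 2:]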
def __init__(self, data_dir, model_file, dataset='sim'):
    # specify pre-trained models to load in the interface
    # and the sample dataset for evaluation: test, sim, synth or table
    self.data_dir = data_dir
    self.data_path = './data/%s_data_{}.txt' % dataset
    self.model_file = './trained_model/memn2n_table_qa_model_%s.pklz' % dataset
    # self.model_file = model_file
    self.reversed_dict = None
    self.memory = None
    self.model = None
    self.loss = None
    self.general_config = None
    # SV load model to embed OOV words
    print("Loading word embeddings model")
    self.word_model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
    # SV keep word vectors for all the dictionary words
    self.dict_vectors = {}
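# Hypothetical helper (not part of the original class) showing how
# dict_vectors could cache lookups so each word is embedded only once:
def embed(self, word):
    if word not in self.dict_vectors:
        self.dict_vectors[word] = self.word_model[word]
    return self.dict_vectors[word]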
def main():
    # arguments
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, format='%(message)s', level=logging.INFO)
    else:
        logging.basicConfig(format='%(message)s', level=logging.INFO)
    # data
    trn_graphs = read_graphs(args.tsv, args.trn_data)
    dev_graphs = read_graphs(args.tsv, args.dev_data)
    # lexicon
    w2v = KeyedVectors.load_word2vec_format(args.w2v, binary=True) if args.w2v else None
    f2v = fasttext.load_model(args.f2v) if args.f2v else None
    a2v = KeyedVectors.load_word2vec_format(args.a2v, binary=True) if args.a2v else None
    lexicon = POSLexicon(w2v=w2v, f2v=f2v, a2v=a2v, output_size=args.output_size)
    # model
    model = POSModel(feature_context=args.feature_context, batch_size=64, w2v_dim=100)
    model.train(trn_graphs, dev_graphs, lexicon, num_steps=args.num_steps,
                bagging_ratio=args.bagging_ratio, optimizer=args.optimizer, force_init=True)
it separately.
"""
output_path = Path(out_dir)
if not output_path.exists():
    output_path.mkdir(parents=True)
    msg.good(f"Created output directory {out_dir}")
if fasttext_filepath:
    msg.info("Loading fastText model vectors from .bin file")
    if in_dir:
        msg.warn("Providing a fastText filepath overrides fastText vector training")
    fasttext_filepath = Path(fasttext_filepath)
    if not fasttext_filepath.exists() or not fasttext_filepath.is_file() or fasttext_filepath.suffix != ".bin":
        msg.fail("Error: fasttext_filepath expects a fastText model .bin file", exits=1)
    fasttext_model = fasttext.load_model(str(fasttext_filepath))
    msg.good("Successfully loaded fastText model")
elif in_dir:
    msg.info("Training fastText model vectors")
    input_path = Path(in_dir)
    # check that the input directory exists and is a directory
    if not input_path.exists() or not input_path.is_dir():
        msg.fail("Not a valid input directory", in_dir, exits=1)
    tmp_path = input_path / "s2v_input.tmp"
    input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"]
    if not input_files:
        msg.fail("Input directory contains no .s2v files", in_dir, exits=1)
    # fastText expects only one input file and only reads from disk, not
    # stdin, so we need to create a temporary file that concatenates the inputs
    with tmp_path.open("a", encoding="utf8") as tmp_file:
        for input_file in input_files:
            with input_file.open("r", encoding="utf8") as f:
                tmp_file.write(f.read())
def bytes_to_python_fasttext_model(bytes_obj):
    # deserialize a fastText model: write the bytes to a temporary file,
    # load it with fastText, then remove the temporary file
    if bytes_obj is None:
        return None
    temp_fpath = temp_model_fpath()
    with open(temp_fpath, 'wb+') as bfile:
        bfile.write(bytes_obj)
    model = load_model(temp_fpath)
    os.remove(temp_fpath)
    return model
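# A plausible inverse (an assumption, not shown in this excerpt): serialize a
# loaded model back to bytes via the same temporary-file round trip. Assumes
# the model object exposes save_model(), as in the official fastText bindings.
def python_fasttext_model_to_bytes(model):
    if model is None:
        return None
    temp_fpath = temp_model_fpath()
    model.save_model(temp_fpath)
    with open(temp_fpath, 'rb') as bfile:
        bytes_obj = bfile.read()
    os.remove(temp_fpath)
    return bytes_obj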