'''
Test scripts for the Python port of fastText.

.. codeauthor:: Svitlana Vakulenko

Acknowledgements:
* fastText https://pypi.python.org/pypi/fasttext
'''
import fasttext
# EMBEDDINGS_MODEL_PATH = '../fastText/result/fil9.bin'
EMBEDDINGS_MODEL_PATH = 'embeddings/fil9.bin'
# print "Loading model from", EMBEDDINGS_MODEL_PATH
model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
# print "Finished loading"
print len(model.words) # number of words in dictionary
print model['king'] # get the vector of the word 'king'
print model['kingserwq'] # get the vector for an OOV word
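# fastText composes OOV vectors from character n-grams, so a misspelling that
# shares the n-grams of 'king' typically retains some similarity to it. A
# minimal check with cosine similarity; the numpy import and the cosine()
# helper below are illustrative additions, not part of the original script.
import numpy as np

def cosine(u, v):
    # Cosine similarity between two embedding vectors.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(model['king'], model['kingserwq']))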
import unittest

class TokenizeTest(unittest.TestCase):
    def test_tokenize(self):
        tokens = fasttext.FastText.tokenize("Hello World")
        self.assertEqual(["Hello", "World"], tokens)
import re
from random import random

from numpy import array, cumsum

import fasttext
from keras.preprocessing import sequence
from keras.layers import TimeDistributed, Bidirectional
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint
dic_kor = fasttext.load_model('vectors/model_kor.bin')
import numpy as np

def loadvector(path):
    # Load a word-embedding text file: each line holds a word followed by its vector values.
    model = {}
    with open(path, 'r') as f:
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    return model
dic_eng = loadvector('vectors/model_eng.txt')
import string
# Map each lowercase ASCII character to an integer id.
idchar = {ch: i for i, ch in enumerate(string.ascii_lowercase)}
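# Illustration: encoding a word as a sequence of character ids with this
# mapping. encode_word is a hypothetical helper, not from the original script.
def encode_word(word):
    # Characters outside a-z are silently skipped.
    return [idchar[ch] for ch in word if ch in idchar]

print(encode_word('king'))  # [10, 8, 13, 6]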
import sys
def read_data(filename):
    # Read a tab-separated file into a list of rows, each row a list of fields.
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data
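# Usage sketch for read_data(), assuming a tab-separated corpus file where each
# row holds a sentence and a label; the file name here is illustrative.
rows = read_data('data/train.txt')
print(rows[0])  # e.g. ['sentence text', 'label']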
print('\n' * 11)
print('#########################################################\n'
      '#                                                       #\n'
      '#      Demonstration: Contextual Spacing 4 Korean       #\n'
      '#                                                       #\n'
      '#########################################################')
import fasttext
print('\nImporting dictionaries...')
model_drama = fasttext.load_model('vectors/model_drama.bin')
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Cap this process at 10% of GPU memory (TensorFlow 1.x session API).
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense, Lambda
import keras.backend as K
from keras.callbacks import ModelCheckpoint
import keras.layers as layers
from keras import optimizers
adam_half = optimizers.Adam(lr=0.0005)  # Adam at half the default learning rate (0.001 / 2)
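# To show where adam_half plugs in: a minimal bidirectional-LSTM tagger
# compiled with it. This is an illustrative sketch; the layer sizes and the
# 300-dim fastText input are assumptions, not values from the original script.
from keras.models import Sequential
from keras.layers import Bidirectional, TimeDistributed, LSTM, Dense

spacing_model = Sequential()
spacing_model.add(Bidirectional(LSTM(64, return_sequences=True),
                                input_shape=(None, 300)))
spacing_model.add(TimeDistributed(Dense(1, activation='sigmoid')))
spacing_model.compile(optimizer=adam_half, loss='binary_crossentropy',
                      metrics=['accuracy'])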
import numpy as np
import fasttext
from gensim.models import Word2Vec  # gensim < 4.0 API: 'size' (renamed vector_size in 4.x)

def fit(self, documents,
        alg='cbow', min_count=5, size=300, max_features=10000, window=5):
    # Method of an embedding wrapper: self.flavor selects gensim ('w2v') or fasttext ('ft').
    assert alg in ('cbow', 'sg')
    if self.flavor == 'w2v':
        alg = 0 if alg == 'cbow' else 1  # gensim encodes CBOW as sg=0, skip-gram as sg=1
        self.model = Word2Vec(
            documents, min_count=min_count, size=size, window=window,
            max_vocab_size=max_features, sg=alg)
        self.model.save(self.path)
    elif self.flavor == 'ft':
        # The legacy fasttext package exposed training as module-level functions.
        func = fasttext.cbow if alg == 'cbow' else fasttext.skipgram
        with open('/tmp/skiptrain.txt', 'w') as f:
            for d in documents:
                f.write(' '.join(d) + '\n')
        self.model = func(
            input_file='/tmp/skiptrain.txt', output=self.path,
            min_count=min_count, dim=size, ws=window)
    self.size = size
    self.default = np.zeros(self.size, dtype='float64')
    self.fitted = True
    return self
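# Usage sketch for fit() above, assuming a hypothetical host class that carries
# the flavor/path/fitted attributes the method expects.
class EmbeddingTrainer:
    def __init__(self, flavor, path):
        self.flavor = flavor  # 'w2v' (gensim) or 'ft' (fasttext)
        self.path = path      # where the trained model is saved
        self.fitted = False

    fit = fit  # attach the module-level function above as a method

trainer = EmbeddingTrainer('ft', '/tmp/ft_model')
trainer.fit([['a', 'quick', 'brown', 'fox'], ['the', 'lazy', 'dog']],
            alg='sg', min_count=1)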
# prenlp text-classification example. The imports below assume prenlp's layout
# (datasets and Normalizer in prenlp.data, tokenizers in prenlp.tokenizer).
import fasttext
import prenlp
from prenlp.data import Normalizer
from prenlp.tokenizer import NLTKMosesTokenizer, SentencePiece, Mecab

normalizer = Normalizer()
imdb_train, imdb_test = prenlp.data.IMDB()
# Preprocessing
tokenizer = NLTKMosesTokenizer()
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))  # both
        # dataset[i][0] = text.strip()                                           # original
        # dataset[i][0] = normalizer.normalize(text.strip())                     # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip()))                      # only tokenization
prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)
# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))
# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
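# For reference: fasttext_transform writes fastText's supervised format, one
# example per line with a '__label__' prefix, e.g.
#   __label__pos this movie was a joy to watch ...
#   __label__neg a tedious , overlong mess ...
# (the label strings come from the dataset's own label values), and predict()
# returns a (labels, probabilities) pair:
labels, probs = model.predict(imdb_test[0][0])
print(labels, probs)  # e.g. ('__label__pos',) [0.97...]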
# Write the normalized corpus (wikitexko: Korean Wikipedia text) to corpus_path
# so SentencePiece can train on it.
with open(corpus_path, 'w', encoding='utf-8') as writer:
    for text in wikitexko:
        writer.write(normalizer.normalize(text.strip()) + '\n')
# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
nsmc_train, nsmc_test = prenlp.data.NSMC()
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))
prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)
# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))
# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
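# The trained classifier can be persisted and reloaded with fastText's own
# serialization; the file name below is illustrative.
model.save_model('nsmc_classifier.bin')
reloaded = fasttext.load_model('nsmc_classifier.bin')
print(reloaded.predict(nsmc_test[0][0]))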
nsmc_train, nsmc_test = prenlp.data.NSMC()
# Preprocessing
tokenizer = Mecab()  # Korean morphological analyzer
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))  # both
        # dataset[i][0] = text.strip()                                           # original
        # dataset[i][0] = normalizer.normalize(text.strip())                     # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip()))                      # only tokenization
prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)
# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))
# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
# Write the normalized raw text to corpus_path for SentencePiece training.
with open(corpus_path, 'w', encoding='utf-8') as writer:
    for text in dataset:
        writer.write(normalizer.normalize(text.strip()) + '\n')
# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))
prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)
# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))
# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))