How to use fasttext - 10 common examples

To help you get started, we’ve selected a few fasttext examples, based on popular ways it is used in public projects.

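All snippets below are copied from the linked repositories. For orientation, here is a minimal sketch of the two most common usage patterns with the official fasttext bindings, loading a pretrained .bin model for word vectors and training a supervised classifier; the file paths are placeholders, not files that ship with the library.

import fasttext

# Word vectors: load a pretrained binary model (path is a placeholder)
model = fasttext.load_model('cc.en.300.bin')
print(model.get_word_vector('king'))          # works for OOV words too, via subword n-grams

# Text classification: each training line looks like '__label__positive some text ...'
clf = fasttext.train_supervised(input='train.txt', epoch=25)
print(clf.predict('this movie was great'))    # (labels, probabilities)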

github svakulenk0 / MemN2N-tableQA / test_fasttext.py
'''
.. codeauthor: svitlana vakulenko
    

Acknowledgements: 
* fastText https://pypi.python.org/pypi/fasttext

Test scripts for Python port of FastText
'''

import fasttext

# EMBEDDINGS_MODEL_PATH = '../fastText/result/fil9.bin'
EMBEDDINGS_MODEL_PATH = 'embeddings/fil9.bin'
# print "Loading model from", EMBEDDINGS_MODEL_PATH
model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
# print "Finished loading"

print(len(model.words))    # number of words in the dictionary
print(model['king'])       # get the vector of the word 'king'
print(model['kingserwq'])  # get the vector for an OOV word (built from subword n-grams)
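This script targets the older standalone fasttext wrapper from PyPI; with the current official bindings the same lookups would typically use the method-based API (a sketch, reusing the model loaded above):

print(model.get_word_vector('king'))             # equivalent of model['king']
print(model.get_nearest_neighbors('king', k=5))  # list of (cosine similarity, word) pairs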
github Kaggle / docker-python / tests / test_fasttext.py
def test_tokenize(self):
        tokens = fasttext.FastText.tokenize("Hello World")

        self.assertEqual(["Hello", "World"], tokens)
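The official fasttext bindings also expose the same tokenizer at module level, so the call can be shortened (a minimal sketch):

import fasttext

print(fasttext.tokenize("Hello World"))   # ['Hello', 'World'] -- simple whitespace tokenization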
github warnikchow / raws / raws.py
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding

from random import random
import numpy as np
from numpy import array
from numpy import cumsum
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import fasttext
import re

dic_kor = fasttext.load_model('vectors/model_kor.bin')
def loadvector(File):
    # read plain-text vectors: each line is a word followed by its embedding values
    model = {}
    with open(File, 'r') as f:
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    return model
dic_eng = loadvector('vectors/model_eng.txt')

import string
# map each lowercase ASCII character to an integer index
idchar = {c: i for i, c in enumerate(string.ascii_lowercase)}
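Note the asymmetry between the two lookups: dic_kor is a fastText model and returns a vector for any string via subword n-grams, while dic_eng is a plain dict with no OOV handling. A hypothetical helper (not part of the original script; the 300-dimension fallback is an assumption) could make the English lookup safe:

def eng_vector(word, dim=300):
    # hypothetical helper, not in raws.py: the plain-text English vectors
    # have no OOV handling, so fall back to a zero vector of assumed size `dim`
    return dic_eng.get(word, np.zeros(dim))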
github warnikchow / ttuyssubot / csct.py
import sys

def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data

print('\n\n\n\n\n\n\n\n\n\n\n')

print('#########################################################\n#                                                       #\n#       Demonstration: Contextual Spacing 4 Korean      #\n#                                                       #\n#########################################################')

import fasttext

print('\nImporting dictionaries...')

model_drama = fasttext.load_model('vectors/model_drama.bin')

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense, Lambda
import keras.backend as K
from keras.callbacks import ModelCheckpoint
import keras.layers as layers

from keras import optimizers
adam_half = optimizers.Adam(lr=0.0005)
github emanjavacas / seqmod / scripts / w2v.py
def fit(self, documents,
            alg='cbow', min_count=5, size=300, max_features=10000, window=5):

        assert alg in ('cbow', 'sg')

        if self.flavor == 'w2v':
            alg = 0 if alg == 'cbow' else 1
            self.model = Word2Vec(
                documents, min_count=min_count, size=size, window=window,
                max_vocab_size=max_features, sg=alg)
            self.model.save(self.path)
        elif self.flavor == 'ft':
            func = fasttext.cbow if alg == 'cbow' else fasttext.skipgram
            with open('/tmp/skiptrain.txt', 'w') as f:
                for d in documents:
                    f.write(' '.join(d) + '\n')
            self.model = func(
                input_file='/tmp/skiptrain.txt', output=self.path,
                min_count=min_count, dim=size, ws=window)

        self.size = size
        self.default = np.zeros(self.size, dtype='float64')
        self.fitted = True

        return self
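fasttext.cbow and fasttext.skipgram come from the older wrapper; with the current official bindings the 'ft' branch could be written with train_unsupervised instead (a sketch, keeping the original parameters):

# rough equivalent of the 'ft' branch with the current fasttext bindings (sketch)
self.model = fasttext.train_unsupervised(
    input='/tmp/skiptrain.txt',
    model='cbow' if alg == 'cbow' else 'skipgram',
    minCount=min_count, dim=size, ws=window)
self.model.save_model(self.path)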
github lyeoni / prenlp / examples / fasttext_imdb.py
imdb_train, imdb_test = prenlp.data.IMDB()

# Preprocessing
tokenizer = NLTKMosesTokenizer()
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip()))) # both
        # dataset[i][0] = text.strip() # original
        # dataset[i][0] = normalizer.normalize(text.strip()) # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip())) # only tokenization

prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
         
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)

# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))

# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
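model.test returns a (number of examples, precision@1, recall@1) tuple and model.predict returns labels with their probabilities, so the evaluation above can be unpacked explicitly (a short sketch on the same files):

n, precision, recall = model.test('imdb.test')        # examples, precision@1, recall@1
print(f'P@1={precision:.3f} R@1={recall:.3f} on {n} examples')

labels, probs = model.predict(imdb_test[0][0], k=2)   # top-2 labels with their probabilities
print(list(zip(labels, probs)))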
github lyeoni / prenlp / examples / fasttext_nsmc_sentencepiece.py
for text in wikitexko:
        writer.write(normalizer.normalize(text.strip())+'\n')

# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
         
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)

# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))

# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
github lyeoni / prenlp / examples / fasttext_nsmc.py
nsmc_train, nsmc_test = prenlp.data.NSMC()

# Preprocessing
tokenizer = Mecab()
for dataset in [nsmc_train, nsmc_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip()))) # both
        # dataset[i][0] = text.strip() # original
        # dataset[i][0] = normalizer.normalize(text.strip()) # only normalization
        # dataset[i][0] = ' '.join(tokenizer(text.strip())) # only tokenization

prenlp.data.fasttext_transform(nsmc_train, 'nsmc.train')
prenlp.data.fasttext_transform(nsmc_test, 'nsmc.test')
         
# Train
model = fasttext.train_supervised(input='nsmc.train', epoch=25)

# Evaluate
print(model.test('nsmc.train'))
print(model.test('nsmc.test'))

# Inference
print(nsmc_test[0][0])
print(model.predict(nsmc_test[0][0]))
github lyeoni / prenlp / examples / fasttext_imdb_sentencepiece.py
for text in dataset:
            writer.write(normalizer.normalize(text.strip())+'\n')

# Preprocessing
tokenizer = SentencePiece()
tokenizer.train(input=corpus_path, model_prefix='sentencepiece', vocab_size=VOCAB_SIZE)
tokenizer.load('sentencepiece.model')
for dataset in [imdb_train, imdb_test]:
    for i, (text, label) in enumerate(dataset):
        dataset[i][0] = ' '.join(tokenizer(normalizer.normalize(text.strip())))

prenlp.data.fasttext_transform(imdb_train, 'imdb.train')
prenlp.data.fasttext_transform(imdb_test, 'imdb.test')
         
# Train
model = fasttext.train_supervised(input='imdb.train', epoch=25)

# Evaluate
print(model.test('imdb.train'))
print(model.test('imdb.test'))

# Inference
print(imdb_test[0][0])
print(model.predict(imdb_test[0][0]))
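Once trained, the classifier can be saved, optionally quantized to shrink it, and reloaded later; a short sketch with placeholder file names:

# persist, compress, and reload the trained classifier (file names are placeholders)
model.save_model('imdb_sp_classifier.bin')
model.quantize(input='imdb.train', retrain=True)   # smaller model, possibly slightly lower accuracy
model.save_model('imdb_sp_classifier.ftz')

reloaded = fasttext.load_model('imdb_sp_classifier.ftz')
print(reloaded.predict(imdb_test[0][0]))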