How to use the fasttext.load_model function in fasttext

To help you get started, we’ve selected a few fasttext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github svakulenk0 / MemN2N-tableQA / test_fasttext.py View on Github external
.. codeauthor: svitlana vakulenko
    

Acknowledgements: 
* fastText https://pypi.python.org/pypi/fasttext

Test scripts for Python port of FastText
'''

import fasttext

# EMBEDDINGS_MODEL_PATH = '../fastText/result/fil9.bin'
EMBEDDINGS_MODEL_PATH = 'embeddings/fil9.bin'

# Load the binary fastText model from EMBEDDINGS_MODEL_PATH.
model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)

# Single-argument print(...) calls behave the same under Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(len(model.words))  # number of words in dictionary
print(model['king'])  # get the vector of the word 'king'
print(model['kingserwq'])  # get the vector for an OOV word
github warnikchow / raws / raws.py View on Github external
from keras.preprocessing import sequence
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding

from random import random
from numpy import array
from numpy import cumsum
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.layers.normalization import BatchNormalization

import fasttext
import re

# fastText model loaded from a binary file (presumably Korean vectors, going by the name — TODO confirm).
dic_kor = fasttext.load_model('vectors/model_kor.bin')
def loadvector(File):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is read as a word followed by its vector
    components, e.g. ``king 0.1 0.2 0.3``.

    Args:
        File: Path of the text file to read.

    Returns:
        dict mapping each word to a 1-D numpy array of floats. When a word
        appears on several lines, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    model = {}
    # The context manager closes the handle even if parsing fails part-way.
    with open(File, 'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank (e.g. trailing) lines carry no word; skip them
                # rather than failing on splitLine[0].
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    return model
# Word -> vector dict parsed from a plain-text vector file (presumably English, going by the name — TODO confirm).
dic_eng = loadvector('vectors/model_eng.txt')

import string
# Map each lowercase ASCII letter to its 0-based position in the alphabet
# ('a' -> 0, ..., 'z' -> 25).
idchar = {letter: index for index, letter in enumerate(string.ascii_lowercase)}
github warnikchow / ttuyssubot / csct.py View on Github external
import sys

def read_data(filename):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of strings obtained by splitting that line on tab characters.
    """
    with open(filename, 'r') as handle:
        contents = handle.read()
    rows = []
    for record in contents.splitlines():
        rows.append(record.split('\t'))
    return rows

print('\n\n\n\n\n\n\n\n\n\n\n')

print('#########################################################\n#                                                       #\n#       Demonstration: Contextual Spacing 4 Korean      #\n#                                                       #\n#########################################################')	
	
import fasttext

# Announce the load step before it starts.
print('\nImporting dictionaries...')

# fastText model loaded from its binary file.
model_drama = fasttext.load_model('vectors/model_drama.bin')

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Set this process's GPU memory fraction to 0.1 and register a TensorFlow
# session built from that config as the session Keras uses.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense, Lambda
import keras.backend as K
from keras.callbacks import ModelCheckpoint
import keras.layers as layers

from keras import optimizers
# Adam optimizer with learning rate 0.0005.
# NOTE(review): "half" presumably means half of Keras' default Adam rate — confirm.
adam_half = optimizers.Adam(lr=0.0005)
github undertheseanlp / underthesea / underthesea / classification / model_fasttext.py View on Github external
def __init__(self):
        """Load the ``fasttext.model`` file located next to this module."""
        module_dir = dirname(__file__)
        self.estimator = fasttext.load_model(join(module_dir, "fasttext.model"))
github deepmipt / hcn-dialogue-manager / hcn / agents / hcn / emb_dict.py View on Github external
if self.opt.get('fasttext_model') is not None:
            if not os.path.isfile(self.opt['fasttext_model']):
                emb_path = os.environ.get('EMBEDDINGS_URL')
                if not emb_path:
                    raise RuntimeError('No pretrained fasttext model provided')
                fname = os.path.basename(opt['fasttext_model'])
                try:
                    print('Trying to download a pretrained fasttext model'
                        ' from the repository')
                    url = urllib.parse.urljoin(emb_path, fname)
                    urllib.request.urlretrieve(url, opt['fasttext_model'])
                    print('Downloaded a fasttext model')
                except Exception as e:
                    raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable'
                                    ' is set incorrectly', e)
            self.fasttext_model = fasttext.load_model(opt['fasttext_model'])
            if self.tok2emb and (self.fasttext_model.dim != self.dim):
                raise RuntimeError("Fasttext model and loaded embeddings have"
                                   " different dimension sizes.")
        else:
            print("No fasttext model provided: using loaded embeddings.")
github clips / clinspell / code / make_devcorpus.py View on Github external
"""
    :param corpusfile: file containing the corpus to sample from
    :param language: language from ['en', 'nl']
    :param outfile: name of devcorpus file
    :param window_size: minimal amount of tokens on each side of a generated misspelling
    :param oov: True if the generated misspellings need to be absent from the vector vocabulary
    :param samplesize: number of lines to sample
    :param editdistance: the type of edit distances generated: 1, 2 or 1 and 2 (80-20 proportion)
    """
    # load lexicon
    assert language in ['en', 'nl']
    with open('lexicon_{}.json'.format(language), 'r') as f:
        vocab = set(json.load(f))

    # load vector vocab
    model = fasttext.load_model('../data/embeddings_{}.bin'.format(language))
    vector_vocab = model.words

    # load sample
    if samplesize:
        corpus_sample(corpusfile, 'devsample.json', samplesize)
        with open('devsample.json', 'r') as f:
            corpus = json.load(f)[0]
    else:
        with open(corpusfile, 'r') as f:
            corpus = json.load(f)[0]

    # generate misspellings corpus with corrections and detection contexts
    functionlist = [letterswitch, letterdelete, letterinsert, lettersub]
    correct_spellings = []
    misspellings = []
    misspelling_contexts = []
github svakulenk0 / MemN2N-tableQA / demo / qa.py View on Github external
def __init__(self, data_dir, model_file, dataset='sim'):
        """Set up paths, empty model state and the word-embedding model.

        Args:
            data_dir: Stored unchanged on ``self.data_dir``.
            model_file: NOTE(review): not used in the lines shown here; the
                model path is derived from ``dataset`` instead (the direct
                assignment is commented out) — confirm this is intended.
            dataset: Name interpolated into the data and model file paths.
        """
        # specify pre-trained models to load in the interface
        # and the sample dataset for evaluation: test sim synth table
        self.data_dir       = data_dir
        # Only %s is filled in here; the '{}' placeholder stays in the string.
        self.data_path      = './data/%s_data_{}.txt' % dataset
        self.model_file     = './trained_model/memn2n_table_qa_model_%s.pklz' % dataset
        # self.model_file     = model_file
        # State that starts out unset.
        self.reversed_dict  = None
        self.memory         = None
        self.model          = None
        self.loss           = None
        self.general_config = None
        # SV load model to embed OOV words
        print("Loading word embeddings model")
        self.word_model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
        # SV keep word vectors for all the dictionary words
        self.dict_vectors = {}
github elitcloud / elit / elit / dev / pos_tagger.py View on Github external
def main():
    """Train a POS model from the parsed command-line arguments.

    Configures logging, reads the training and development graphs, loads
    whichever embedding models were given, builds the lexicon and runs
    training.
    """
    # arguments
    args = parse_args()
    log_options = {'format': '%(message)s', 'level': logging.INFO}
    if args.log:
        # A log file was requested, so pass it through to basicConfig.
        log_options['filename'] = args.log
    logging.basicConfig(**log_options)

    # data
    trn_graphs = read_graphs(args.tsv, args.trn_data)
    dev_graphs = read_graphs(args.tsv, args.dev_data)

    # lexicon: every embedding source is optional and stays None when absent
    w2v = None
    if args.w2v:
        w2v = KeyedVectors.load_word2vec_format(args.w2v, binary=True)
    f2v = None
    if args.f2v:
        f2v = fasttext.load_model(args.f2v)
    a2v = None
    if args.a2v:
        a2v = KeyedVectors.load_word2vec_format(args.a2v, binary=True)

    lexicon = POSLexicon(w2v=w2v, f2v=f2v, a2v=a2v, output_size=args.output_size)

    # model
    tagger = POSModel(feature_context=args.feature_context, batch_size=64, w2v_dim=100)
    tagger.train(
        trn_graphs,
        dev_graphs,
        lexicon,
        num_steps=args.num_steps,
        bagging_ratio=args.bagging_ratio,
        optimizer=args.optimizer,
        force_init=True,
    )
github explosion / sense2vec / scripts / 04_fasttext_train_vectors.py View on Github external
it separately.
    """

    output_path = Path(out_dir)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")

    if fasttext_filepath:
        msg.info("Loading fastText model vectors from .bin file")
        if in_dir:
            msg.warn(f"Warning: Providing a fastText filepath overrides fastText vector training")
        fasttext_filepath = Path(fasttext_filepath)
        if not fasttext_filepath.exists() or not fasttext_filepath.is_file() or not (fasttext_filepath.suffix == '.bin'):
            msg.fail("Error: fasttext_filepath expects a fastText model .bin file", exits=1)
        fasttext_model = fasttext.load_model(str(fasttext_filepath))
        msg.good("Successfully loaded fastText model")
    elif in_dir:
        msg.info("Training fastText model vectors")
        input_path = Path(in_dir)
        # Check to see if fasttext_filepath exists
        if not input_path.exists() or not input_path.is_dir():
            msg.fail("Not a valid input directory", in_dir, exits=1)
        tmp_path = input_path / "s2v_input.tmp"
        input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"]
        if not input_files:
            msg.fail("Input directory contains no .s2v files", in_dir, exits=1)
        # fastText expects only one input file and only reads from disk and not
        # stdin, so we need to create a temporary file that concatenates the inputs
        with tmp_path.open("a", encoding="utf8") as tmp_file:
            for input_file in input_files:
                with input_file.open("r", encoding="utf8") as f:
github shaypal5 / skift / skift / util.py View on Github external
def bytes_to_python_fasttext_model(bytes_obj):
    """Rebuild a model from its serialized bytes via a temporary file.

    The bytes are written to the path returned by ``temp_model_fpath()``,
    that path is handed to ``load_model``, and the file is then deleted.

    Args:
        bytes_obj: Serialized model contents, or ``None``.

    Returns:
        Whatever ``load_model`` returns, or ``None`` when ``bytes_obj`` is
        ``None``.
    """
    if bytes_obj is None:
        return None
    temp_fpath = temp_model_fpath()
    try:
        with open(temp_fpath, 'wb') as bfile:
            bfile.write(bytes_obj)
        return load_model(temp_fpath)
    finally:
        # Remove the temporary file even when writing or loading fails, so
        # a bad payload does not leave a stray model file on disk. The
        # existence check avoids masking an error raised before creation.
        if os.path.exists(temp_fpath):
            os.remove(temp_fpath)