How to use the kaggle.build_vocab function in kaggle

To help you get started, we’ve selected a few kaggle.build_vocab examples, based on popular ways the function is used in public projects.
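
All of the snippets below come from sujitpal/dl-models-for-qa and call build_vocab from that repository's local kaggle.py helper module. The function takes story token lists, training question-answer pairs, and test question-answer pairs, and returns a word-to-index dictionary. As a rough sketch of the behavior the callers rely on (an assumption for illustration, not the repository's exact code), it could be written like this, with indices starting at 1 so that 0 stays free as the padding/mask value:

# Illustrative sketch only: build_vocab_sketch and its details are assumptions;
# the real helper lives in the repository's kaggle.py.
import collections
import itertools

def build_vocab_sketch(stories, qapairs, tqapairs):
    counter = collections.Counter()
    for words in stories:
        counter.update(words)
    for qapair in itertools.chain(qapairs, tqapairs):
        counter.update(qapair[0])  # question words
        counter.update(qapair[1])  # answer words
    # reserve index 0 for the mask/padding character
    return {word: idx + 1
            for idx, (word, _) in enumerate(counter.most_common())}

Every caller then derives vocab_size from this dictionary (adding 1 when index 0 is used for masking) and passes the word2idx mapping on to the vectorize_* helpers.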


github sujitpal / dl-models-for-qa / src / qa-dense-autoencoder.py
EMBED_SIZE = 64
BATCH_SIZE = 256
NBR_EPOCHS = 20

stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE))
story_maxlen = max([len(words) for words in stories])

# this part is only required to get the maximum sequence length
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)

Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)

signal = Input(shape=(seq_maxlen,))
encoded = Dense(EMBED_SIZE, init="glorot_uniform", activation="relu")(signal)
decoded = Dense(seq_maxlen, init="glorot_uniform", activation="sigmoid")(encoded)
autoencoder = Model(input=signal, output=decoded)

autoencoder.compile("adadelta", loss="binary_crossentropy")

autoencoder.fit(Xstrain, Xstrain, nb_epoch=NBR_EPOCHS, batch_size=BATCH_SIZE,
                shuffle=True, validation_data=(Xstest, Xstest))
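
This file, like the others on this page, is written against the Keras 1.x API (init=, input=/output=, nb_epoch=). If you are reproducing it on Keras 2+, the model-building and training calls map onto the newer argument names roughly as follows (a sketch that only renames arguments; the data preparation above is unchanged):

# Same dense autoencoder, Keras 2-style argument names
from keras.layers import Dense, Input
from keras.models import Model

signal = Input(shape=(seq_maxlen,))
encoded = Dense(EMBED_SIZE, kernel_initializer="glorot_uniform",
                activation="relu")(signal)
decoded = Dense(seq_maxlen, kernel_initializer="glorot_uniform",
                activation="sigmoid")(encoded)
autoencoder = Model(inputs=signal, outputs=decoded)

autoencoder.compile(optimizer="adadelta", loss="binary_crossentropy")
autoencoder.fit(Xstrain, Xstrain, epochs=NBR_EPOCHS, batch_size=BATCH_SIZE,
                shuffle=True, validation_data=(Xstest, Xstest))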
github sujitpal / dl-models-for-qa / src / qa-blstm-attn.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
print("Loading Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        embedding_weights[index, :] = word2vec[word.lower()]
    except KeyError:
        pass  # keep as zero (word not in the word2vec vocabulary)
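
Word2Vec.load_word2vec_format is the old gensim (pre-1.0) entry point for the pretrained GoogleNews binary. On current gensim releases the same file is loaded through KeyedVectors instead, roughly like this:

# gensim >= 1.0 equivalent of the call above
from gensim.models import KeyedVectors

word2vec = KeyedVectors.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)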
github sujitpal / dl-models-for-qa / src / qa-lstm-fem-attn.py
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])

# Even though we don't use the test set for classification, we still need
# to consider any additional vocabulary words from it for when we use the
# model for prediction (against the test set).
tqapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TEST_FILE), is_test=True)    
tq_maxlen = max([len(qapair[0]) for qapair in tqapairs])
ta_maxlen = max([len(qapair[1]) for qapair in tqapairs])

seq_maxlen = max([question_maxlen, answer_maxlen, tq_maxlen, ta_maxlen])

word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
print("Loading Word2Vec model and generating embedding matrix...")
embedding_weights = kaggle.get_weights_word2vec(word2idx,
    os.path.join(DATA_DIR, WORD2VEC_BIN), is_custom=True)
        
print("Building model...")

# output: (None, QA_EMBED_SIZE, seq_maxlen)
github sujitpal / dl-models-for-qa / src / qa-lstm-fem.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

print("Loading flashcard Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load(os.path.join(DATA_DIR, WORD2VEC_MODEL))
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        embedding_weights[index, :] = word2vec[word.lower()]
    except KeyError:
        pass  # keep as zero (not ideal, but what else can we do?)
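
The excerpts on this page build embedding_weights but stop before the layer that consumes it. In the Keras 1.x style used throughout this repository, the matrix is typically handed to an Embedding layer along these lines (a sketch for illustration, not this file's exact code); reserving index 0 for masking is also why vocab_size is len(word2idx) + 1:

# Sketch: wiring the precomputed matrix into an Embedding layer
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=WORD2VEC_EMBED_SIZE,
                            input_length=seq_maxlen,
                            weights=[embedding_weights],
                            mask_zero=True)  # index 0 = padding/mask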
github sujitpal / dl-models-for-qa / src / qa-blstm-fem-attn.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
print("Loading Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load(os.path.join(DATA_DIR, WORD2VEC_BIN))
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        embedding_weights[index, :] = word2vec[word.lower()]
    except KeyError:
        pass  # keep as zero (word not in the word2vec vocabulary)
github sujitpal / dl-models-for-qa / src / qa-lstm.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
# see https://github.com/fchollet/keras/issues/853
print("Loading Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        embedding_weights[index, :] = word2vec[word.lower()]
    except KeyError:
        pass  # keep as zero (word not in the word2vec vocabulary)
github sujitpal / dl-models-for-qa / src / qa-blstm.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"

QA_EMBED_SIZE = 64
BATCH_SIZE = 32
NBR_EPOCHS = 20

## extract data

print("Loading and formatting data...")
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab([], qapairs, [])
vocab_size = len(word2idx) + 1 # include mask character 0

Xq, Xa, Y = kaggle.vectorize_qapairs(qapairs, word2idx, seq_maxlen)
Xqtrain, Xqtest, Xatrain, Xatest, Ytrain, Ytest = \
    train_test_split(Xq, Xa, Y, test_size=0.3, random_state=42)
print(Xqtrain.shape, Xqtest.shape, Xatrain.shape, Xatest.shape, 
      Ytrain.shape, Ytest.shape)

# get embeddings from word2vec
print("Loading Word2Vec model and generating embedding matrix...")
word2vec = Word2Vec.load_word2vec_format(
    os.path.join(DATA_DIR, WORD2VEC_BIN), binary=True)
embedding_weights = np.zeros((vocab_size, WORD2VEC_EMBED_SIZE))
for word, index in word2idx.items():
    try:
        embedding_weights[index, :] = word2vec[word.lower()]
    except KeyError:
        pass  # keep as zero (word not in the word2vec vocabulary)
github sujitpal / dl-models-for-qa / src / qa-lstm-autoencoder.py
EMBED_SIZE = 64
BATCH_SIZE = 256
NBR_EPOCHS = 20

stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE))
story_maxlen = max([len(words) for words in stories])

# this part is only required to get the maximum sequence length
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
question_maxlen = max([len(qapair[0]) for qapair in qapairs])
answer_maxlen = max([len(qapair[1]) for qapair in qapairs])
seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen])

word2idx = kaggle.build_vocab(stories, qapairs, [])
vocab_size = len(word2idx)

Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen)
Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42)
print(Xstrain.shape, Xstest.shape)

inputs = Input(shape=(seq_maxlen, vocab_size))
encoded = LSTM(EMBED_SIZE)(inputs)
decoded = RepeatVector(seq_maxlen)(encoded)
decoded = LSTM(vocab_size, return_sequences=True)(decoded)
autoencoder = Model(inputs, decoded)

autoencoder.compile("adadelta", loss="binary_crossentropy")

autoencoder.fit(Xstrain, Xstrain, nb_epoch=NBR_EPOCHS, batch_size=BATCH_SIZE,
                shuffle=True, validation_data=(Xstest, Xstest))
github sujitpal / dl-models-for-qa / src / deploy-model.py
QA_TRAIN_FILE = "8thGr-NDMC-Train.csv"
QA_TEST_FILE = "8thGr-NDMC-Test.csv"

WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz"
WORD2VEC_EMBED_SIZE = 300

LSTM_SEQLEN = 196 # from original model
NUM_CHOICES = 4   # number of choices for multiple choice

#### Load up the vectorizer
qapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TRAIN_FILE))
tqapairs = kaggle.get_question_answer_pairs(
    os.path.join(DATA_DIR, QA_TEST_FILE), is_test=True)

word2idx = kaggle.build_vocab([], qapairs, tqapairs)
vocab_size = len(word2idx) + 1 # include mask character 0

#### Load up the model
with open(os.path.join(MODEL_DIR, MODEL_ARCH), "rb") as fjson:
    json = fjson.read()
model = model_from_json(json)
model.load_weights(os.path.join(MODEL_DIR, MODEL_WEIGHTS))

#### read in the data ####
#### correct_answer = "B"
question = "Which is a distinction between an epidemic and a pandemic?"
answers = ["the symptoms of the disease",
           "the geographical area affected",
           "the species of organisms infected",
           "the season in which the disease spreads"]
qwords = nltk.word_tokenize(question)
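
The excerpt ends after tokenizing the question. A plausible continuation (hypothetical code following the same conventions as the training scripts: map each token through word2idx, pad to LSTM_SEQLEN, and score every candidate answer against the question) would look roughly like this:

# Hypothetical continuation, for illustration only
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def to_ids(words):
    # unknown words fall back to the mask index 0
    return [word2idx.get(w, 0) for w in words]

Xq = pad_sequences([to_ids(qwords)] * NUM_CHOICES, maxlen=LSTM_SEQLEN)
Xa = pad_sequences([to_ids(nltk.word_tokenize(a)) for a in answers],
                   maxlen=LSTM_SEQLEN)

preds = model.predict([Xq, Xa])      # assumed: model takes [question, answer]
best = int(np.argmax(preds[:, -1]))  # assumed: last column scores correctness
print("predicted answer: %s" % "ABCD"[best])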