# seq2seq (Keras) training excerpt: build one-hot data, split, train, and save the models
# one-hot representation of the input/target characters
encoder_input_data = np.zeros((len(input_texts), max_input_texts_len, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros((len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
logger.info("Data loaded.")

# split into train and validation sets
encoder_input_data_train, encoder_input_data_val, decoder_input_data_train, decoder_input_data_val, \
    decoder_target_data_train, decoder_target_data_val = train_test_split(
        encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1)
# model
logger.info("Training seq2seq model...")
model, encoder_model, decoder_model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
# run training
callbacks_list = callback(save_model_path, logger)
model.fit_generator(
    generator=data_generator(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                             batch_size),
    steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val),
    callbacks=callbacks_list)
encoder_model.save(encoder_model_path)
decoder_model.save(decoder_model_path)
logger.info("Model saved to " + save_model_path)
logger.info("Training has finished.")
save_word_dict(target_token_index, save_target_token_path)
evaluate(encoder_model, decoder_model, num_encoder_tokens,
         num_decoder_tokens, rnn_hidden_dim, target_token_index,
         max_target_texts_len, encoder_input_data_val, input_texts)
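# The one-timestep offset between decoder_input_data and decoder_target_data above implements
# teacher forcing: at step t the decoder is fed the true character t and trained to predict
# character t + 1. A minimal, self-contained sketch of that shift on a toy vocabulary
# (all names and data here are illustrative, not from the project):
import numpy as np

target_text_demo = "\tabc\n"  # "\t" as start token, "\n" as end token
token_index_demo = {"\t": 0, "a": 1, "b": 2, "c": 3, "\n": 4}

dec_input = np.zeros((len(target_text_demo), len(token_index_demo)), dtype="float32")
dec_target = np.zeros((len(target_text_demo), len(token_index_demo)), dtype="float32")
for t, char in enumerate(target_text_demo):
    dec_input[t, token_index_demo[char]] = 1.0           # decoder sees the true char at step t
    if t > 0:
        dec_target[t - 1, token_index_demo[char]] = 1.0  # target at step t-1 is the char at step t

# row t of dec_target is the one-hot of target_text_demo[t + 1]
assert dec_target[0].argmax() == token_index_demo["a"]
assert dec_target[1].argmax() == token_index_demo["b"]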
# TensorFlow RNN language-model training excerpt.
# The excerpt begins inside the graph-builder call; the assignment and function name
# ("rnn_model") below are reconstructed from context and may differ in the original file.
end_points = rnn_model(
    input_data=input_data,
    output_data=output_targets,
    vocab_size=len(word_to_int),
    rnn_size=128,
    num_layers=2,
    batch_size=config.batch_size,
    learning_rate=config.learning_rate)
saver = tf.train.Saver(tf.global_variables())
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
# start a session and run training
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
    # initialize variables
    sess.run(init_op)
    start_epoch = 0
    checkpoint = tf.train.latest_checkpoint(config.model_dir)
    if checkpoint:
        saver.restore(sess, checkpoint)
        print("restore from the checkpoint {0}".format(checkpoint))
        start_epoch += int(checkpoint.split('-')[-1])
    print('start training...')
    try:
        for epoch in range(start_epoch, config.epochs):
            n = 0
            n_chunk = len(data_vector) // config.batch_size
            for batch in range(n_chunk):
                loss, _, _, perplexity = sess.run([
                    end_points['total_loss'],
                    end_points['last_state'],
                    end_points['train_op'],
                    end_points['perplexity']
                ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
                n += 1  # advance to the next pre-built batch
    except KeyboardInterrupt:
        # the original excerpt is truncated here; saving a checkpoint on interrupt is the usual pattern
        print('Interrupted, stopping training.')
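# In the training loop above, perplexity is fetched from the graph alongside the loss.
# For a language model, perplexity is conventionally the exponential of the mean per-token
# cross-entropy; whether end_points['perplexity'] is computed exactly this way depends on
# the model code, but the relationship can be sketched as:
import numpy as np

def perplexity_from_loss(mean_cross_entropy_nats: float) -> float:
    # perplexity = exp(mean cross-entropy), when the loss is measured in nats
    return float(np.exp(mean_cross_entropy_nats))

print(perplexity_from_loss(4.6))  # ~99.5, i.e. about as uncertain as a uniform choice over ~100 tokens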
logger.info("Training seq2seq model...")
model, encoder_model, decoder_model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
# Run training
callbacks_list = callback(save_model_path, logger)
model.fit_generator(
generator=data_generator(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
batch_size),
steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
epochs=epochs,
verbose=1,
validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val),
callbacks=callbacks_list)
encoder_model.save(encoder_model_path)
decoder_model.save(decoder_model_path)
logger.info("Model save to " + save_model_path)
logger.info("Training has finished.")
evaluate(encoder_model, decoder_model, num_encoder_tokens,
num_decoder_tokens, rnn_hidden_dim, target_token_index,
max_target_texts_len, encoder_input_data_val, input_texts)
# attention-based seq2seq training excerpt (the excerpt begins inside the model-builder call;
# the "model = ..." assignment head is not included in the excerpt)
        attn_model_path=attn_model_path,
        hidden_dim=hidden_dim,
        dropout=dropout,
        gpu_id=gpu_id
    ).build_model()
    evaluator = Evaluate(model, attn_model_path, vocab2id, id2vocab, maxlen)
    earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    model.fit_generator(data_generator(source_texts, target_texts, vocab2id, batch_size, maxlen),
                        steps_per_epoch=(len(source_texts) + batch_size - 1) // batch_size,
                        epochs=epochs,
                        validation_data=get_validation_data(test_input_texts, test_target_texts, vocab2id, maxlen),
                        callbacks=[evaluator, earlystop])


if __name__ == "__main__":
    train(train_path=config.train_path,
          test_path=config.test_path,
          save_vocab_path=config.save_vocab_path,
          attn_model_path=config.attn_model_path,
          batch_size=config.batch_size,
          epochs=config.epochs,
          maxlen=config.maxlen,
          hidden_dim=config.rnn_hidden_dim,
          dropout=config.dropout,
          vocab_max_size=config.vocab_max_size,
          vocab_min_count=config.vocab_min_count,
          gpu_id=config.gpu_id)
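# Every fit_generator call in these excerpts computes steps_per_epoch with the same
# "(n + batch_size - 1) // batch_size" idiom. That is integer ceiling division, so each
# epoch covers all samples and the last batch may be partial. A quick check:
import math

def steps_per_epoch(num_samples: int, batch_size: int) -> int:
    return (num_samples + batch_size - 1) // batch_size

assert steps_per_epoch(1000, 64) == math.ceil(1000 / 64) == 16
assert steps_per_epoch(1024, 64) == 16  # exact multiple: no partial batch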
# confusion-candidate generation excerpt (same-pinyin character swaps)
# same first pinyin
confusion_char_set = _get_confusion_set(word[0])
confusion = [i + word[1:] for i in confusion_char_set if i]
candidates_2_order.extend(confusion)
# same last pinyin
confusion_char_set = _get_confusion_set(word[-1])
confusion = [word[:-1] + i for i in confusion_char_set]
candidates_2_order.extend(confusion)
if len(word) > 2:
    # same middle pinyin
    confusion_char_set = _get_confusion_set(word[1])
    confusion = [word[0] + i + word[2:] for i in confusion_char_set]
    candidates_3_order.extend(confusion)
# merge all confusion candidates, keep only Chinese strings, rank by frequency
confusion_word_set = set(candidates_1_order + candidates_2_order + candidates_3_order)
confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
confusion_sorted = sorted(confusion_word_list, key=lambda k: get_frequency(k), reverse=True)
return confusion_sorted[:len(confusion_word_list) // fraction + 1]
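# The routine above builds correction candidates by swapping a single character of the input
# word for characters that share its pinyin, then ranking candidates by frequency. A
# self-contained sketch of the same idea; _toy_confusion_set is a hypothetical stand-in for
# the project's _get_confusion_set, which looks confusions up in its own dictionaries:
def _toy_confusion_set(char):
    # hypothetical same-pinyin confusion table, for illustration only
    table = {'机': {'鸡', '基'}, '器': {'气', '汽'}}
    return table.get(char, set())

def toy_candidates(word):
    candidates = set()
    candidates.update(c + word[1:] for c in _toy_confusion_set(word[0]))    # swap first char
    candidates.update(word[:-1] + c for c in _toy_confusion_set(word[-1]))  # swap last char
    return candidates

print(sorted(toy_candidates('机器')))  # four single-character swaps of '机器'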
def demo2():
    # x is expected to be a list of sentences defined earlier in the demo script
    for i in x:
        print(i, pycorrector.detect(i))
        print(i, pycorrector.correct(i))
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""
import pycorrector
corrected_sent, detail = pycorrector.correct('cv')
print(corrected_sent, detail)
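# In the API version these excerpts target, pycorrector.correct() returns a
# (corrected_sentence, detail) tuple, as unpacked above, and pycorrector.detect() returns
# only the detected-error details. A typical call on a full sentence (actual output depends
# on the installed language models, so none is shown here):
import pycorrector

sentence = '少先队员因该为老人让坐'
print(pycorrector.detect(sentence))
print(pycorrector.correct(sentence))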