How to use pycorrector - common examples

To help you get started, we’ve selected a few pycorrector examples based on popular ways it is used in public projects. The snippets below all come from the shibing624/pycorrector repository itself and cover training its seq2seq, attention-based seq2seq, and RNN language models, generating confusion-set candidates, and calling the top-level detect/correct API.

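Most of the snippets below exercise pycorrector's training code, but the quickest way to use the library is the module-level API shown in the last two examples. Here is a minimal sketch assuming the classic function-style interface, where pycorrector.correct returns a (corrected_sentence, details) tuple and pycorrector.detect returns the suspected error spans; the exact return format can differ between pycorrector versions, and the input sentence is only illustrative.

# -*- coding: utf-8 -*-
import pycorrector

# Illustrative input: a sentence with common Chinese typos.
sentence = '少先队员因该为老人让坐'

# correct() returns the corrected sentence plus details about each fix.
corrected_sent, detail = pycorrector.correct(sentence)
print(corrected_sent, detail)

# detect() only reports the suspected errors without rewriting the text.
idx_errors = pycorrector.detect(sentence)
print(idx_errors)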

From shibing624/pycorrector: pycorrector/seq2seq/train_generator.py
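This snippet fills the one-hot training tensors character by character, splits them into training and validation sets, and trains the seq2seq model with fit_generator before saving the encoder and decoder models.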
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # split to train and val
    encoder_input_data_train, encoder_input_data_val, decoder_input_data_train, decoder_input_data_val, \
    decoder_target_data_train, decoder_target_data_val = train_test_split(
        encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1)

    # model
    logger.info("Training seq2seq model...")
    model, encoder_model, decoder_model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit_generator(
        generator=data_generator(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                                 batch_size),
        steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val),
        callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model save to " + save_model_path)
    logger.info("Training has finished.")
From shibing624/pycorrector: pycorrector/seq2seq/train_generator.py
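A fuller view of the same data preparation: it saves the target vocabulary, allocates the encoder/decoder one-hot arrays, fills them from the parallel texts, and then starts the same training run.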
    save_word_dict(target_token_index, save_target_token_path)

    encoder_input_data = np.zeros((len(input_texts), max_input_texts_len, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros((len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros((len(input_texts), max_target_texts_len, num_decoder_tokens), dtype='float32')

    # one hot representation
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    logger.info("Data loaded.")

    # split to train and val
    encoder_input_data_train, encoder_input_data_val, decoder_input_data_train, decoder_input_data_val, \
    decoder_target_data_train, decoder_target_data_val = train_test_split(
        encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.1)

    # model
    logger.info("Training seq2seq model...")
    model, encoder_model, decoder_model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit_generator(
        generator=data_generator(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                                 batch_size),
        steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
        epochs=epochs,
From shibing624/pycorrector: pycorrector/rnn_lm/train.py
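This snippet wires the input and target tensors into the RNN language model, restores the latest checkpoint if one exists, and runs the epoch/batch training loop in a TensorFlow session, fetching the loss and perplexity for every batch.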
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_int),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=config.batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    # start
    with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
        # init
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(config.model_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('start training...')
        try:
            for epoch in range(start_epoch, config.epochs):
                n = 0
                n_chunk = len(data_vector) // config.batch_size
                for batch in range(n_chunk):
                    loss, _, _, perplexity = sess.run([
                        end_points['total_loss'],
                        end_points['last_state'],
                        end_points['train_op'],
                        end_points['perplexity']
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
From shibing624/pycorrector: pycorrector/seq2seq/train_generator.py
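The tail end of the same training script: after fitting and saving the models, it calls evaluate() to decode the validation inputs with the trained encoder and decoder.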
    # model
    logger.info("Training seq2seq model...")
    model, encoder_model, decoder_model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
    # Run training
    callbacks_list = callback(save_model_path, logger)
    model.fit_generator(
        generator=data_generator(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                                 batch_size),
        steps_per_epoch=(len(encoder_input_data_train) + batch_size - 1) // batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=([encoder_input_data_val, decoder_input_data_val], decoder_target_data_val),
        callbacks=callbacks_list)
    encoder_model.save(encoder_model_path)
    decoder_model.save(decoder_model_path)
    logger.info("Model save to " + save_model_path)
    logger.info("Training has finished.")

    evaluate(encoder_model, decoder_model, num_encoder_tokens,
             num_decoder_tokens, rnn_hidden_dim, target_token_index,
             max_target_texts_len, encoder_input_data_val, input_texts)
From shibing624/pycorrector: pycorrector/seq2seq_attention/train.py
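This snippet builds the attention-based seq2seq model, trains it with fit_generator using an Evaluate callback and early stopping on validation loss, and shows the __main__ block that passes paths and hyperparameters in from config.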
                             attn_model_path=attn_model_path,
                             hidden_dim=hidden_dim,
                             dropout=dropout,
                             gpu_id=gpu_id
                             ).build_model()
    evaluator = Evaluate(model, attn_model_path, vocab2id, id2vocab, maxlen)
    earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    model.fit_generator(data_generator(source_texts, target_texts, vocab2id, batch_size, maxlen),
                        steps_per_epoch=(len(source_texts) + batch_size - 1) // batch_size,
                        epochs=epochs,
                        validation_data=get_validation_data(test_input_texts, test_target_texts, vocab2id, maxlen),
                        callbacks=[evaluator, earlystop])


if __name__ == "__main__":
    train(train_path=config.train_path,
          test_path=config.test_path,
          save_vocab_path=config.save_vocab_path,
          attn_model_path=config.attn_model_path,
          batch_size=config.batch_size,
          epochs=config.epochs,
          maxlen=config.maxlen,
          hidden_dim=config.rnn_hidden_dim,
          dropout=config.dropout,
          vocab_max_size=config.vocab_max_size,
          vocab_min_count=config.vocab_min_count,
          gpu_id=config.gpu_id)
From shibing624/pycorrector: pycorrector/corrector.py
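This snippet generates correction candidates for a word by substituting same-pinyin confusion characters at the first, last, and middle positions, filters the candidates down to Chinese strings, and returns the most frequent slice of them.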
        # same first pinyin
        confusion_char_set = _get_confusion_set(word[0])
        confusion = [i + word[1:] for i in confusion_char_set if i]
        candidates_2_order.extend(confusion)
        # same last pinyin
        confusion_char_set = _get_confusion_set(word[-1])
        confusion = [word[:-1] + i for i in confusion_char_set]
        candidates_2_order.extend(confusion)
        if len(word) > 2:
            # same mid pinyin
            confusion_char_set = _get_confusion_set(word[1])
            confusion = [word[0] + i + word[2:] for i in confusion_char_set]
            candidates_3_order.extend(confusion)
    # add all confusion word list
    confusion_word_set = set(candidates_1_order + candidates_2_order + candidates_3_order)
    confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
    confusion_sorted = sorted(confusion_word_list, key=lambda k: get_frequency(k), reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
From shibing624/pycorrector: tests/long_text_error.py
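This test iterates over a list of long sentences (x, defined earlier in the script) and prints the output of pycorrector.detect() and pycorrector.correct() for each one.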
def demo2():
    for i in x:
        print(i, pycorrector.detect(i))
        print(i, pycorrector.correct(i))
From shibing624/pycorrector: tests/history_bug.py
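A regression test for a previously reported issue: it runs pycorrector.correct() on the short non-Chinese input 'cv' and prints the corrected text and error details.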
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: 
"""

import pycorrector

corrected_sent, detail = pycorrector.correct('cv')
print(corrected_sent, detail)