How to use the pycorrector.rnn_lm.config module in pycorrector

To help you get started, we’ve selected a few pycorrector examples, based on popular ways it is used in public projects.


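Across these snippets, pycorrector.rnn_lm.config is used as a plain settings module: each script imports it and reads hyper-parameters and file paths as attributes. A minimal sketch (the attribute names are the ones the snippets below reference; nothing else is assumed):

from pycorrector.rnn_lm import config

# Hyper-parameters consumed by train.py and infer.py
print(config.batch_size)
print(config.learning_rate)
print(config.epochs)
# File-system paths shared by preprocess.py, train.py and infer.py
print(config.model_dir)        # checkpoint directory
print(config.word_dict_path)   # vocabulary file
print(config.train_word_path)  # segmented training text
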
github shibing624 / pycorrector / pycorrector / rnn_lm / train.py
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_int),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=config.batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    # start
    with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
        # init
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(config.model_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('start training...')
        try:
            for epoch in range(start_epoch, config.epochs):
                n = 0
                n_chunk = len(data_vector) // config.batch_size
                for batch in range(n_chunk):
                    loss, _, _, perplexity = sess.run([
                        end_points['total_loss'],
                        end_points['last_state'],
                        end_points['train_op'],
                        end_points['perplexity']
                    ], feed_dict={input_data: batches_inputs[n], output_targets: batches_outputs[n]})
github shibing624 / pycorrector / pycorrector / rnn_lm / infer.py
def generate(begin_word):
    batch_size = 1
    word_to_idx = load_word_dict(config.word_dict_path)
    vocabularies = [k for k, v in word_to_idx.items()]
    tf.reset_default_graph()
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=None,
                           vocab_size=len(word_to_idx),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=64,
                           learning_rate=0.0002)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)
github shibing624 / pycorrector / pycorrector / rnn_lm / infer.py
    tf.reset_default_graph()
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=None,
                           vocab_size=len(word_to_idx),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=64,
                           learning_rate=0.0002)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)
        checkpoint = tf.train.latest_checkpoint(config.model_dir)
        saver.restore(sess, checkpoint)
        print("loading model from the checkpoint {0}".format(checkpoint))
        x = np.array([list(map(word_to_idx.get, START_TOKEN))])
        [predict, last_state] = sess.run([end_points['prediction'],
                                          end_points['last_state']],
                                         feed_dict={input_data: x})
        if begin_word:
            word = begin_word
        else:
            word = to_word(predict, vocabularies)
        sentence = ''
        i = 0
        while word != END_TOKEN and word != START_TOKEN and word != UNK_TOKEN:
            sentence += word
            i += 1
            if i >= 24:
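A hypothetical usage sketch (not one of the repository's snippets): calling generate() after training has written a checkpoint to config.model_dir. It assumes infer.py is importable as shown and that generate() returns the sentence it builds character by character:

from pycorrector.rnn_lm.infer import generate

# Seed character for the language model to continue; requires a trained
# checkpoint in config.model_dir and the word dict at config.word_dict_path.
print(generate('我'))
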
github shibing624 / pycorrector / pycorrector / rnn_lm / preprocess.py
def save_data_list(data_list, data_path):
    with open(data_path, 'w', encoding='utf-8') as f:
        count = 0
        for line in data_list:
            f.write(' '.join(line) + '\n')
            count += 1
        print("save line size:%d to %s" % (count, data_path))


if __name__ == '__main__':
    # train data
    train_words = []
    for path in config.raw_train_paths:
        train_words.extend(parse_xml_file(path))
    save_data_list(train_words, config.train_word_path)
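The config paths tie the pipeline together: save_data_list() writes one space-separated, tokenised sentence per line to config.train_word_path, which train.py later reads through process_data(). A minimal, illustrative sketch for inspecting that file:

from pycorrector.rnn_lm import config

# Peek at the first few lines produced by the preprocessing step above.
with open(config.train_word_path, encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        print(line.rstrip('\n'))
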
github shibing624 / pycorrector / pycorrector / rnn_lm / infer.py
def ppl(sentence_list):
    result = dict()
    # load data dict
    word_to_idx = load_word_dict(config.word_dict_path)
    idx_to_word = {v: k for k, v in word_to_idx.items()}
    # init params
    batch_size = 1
    tf.reset_default_graph()
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    output_targets = tf.placeholder(tf.int32, [batch_size, None])
    # init model
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_idx),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
github shibing624 / pycorrector / pycorrector / rnn_lm / train.py
def main(_):
    # build vocab and word dict
    data_vector, word_to_int = process_data(config.train_word_path, config.word_dict_path, config.cutoff_frequency)
    random.shuffle(data_vector)
    # batch data
    batches_inputs, batches_outputs = generate_batch(config.batch_size, data_vector, word_to_int)
    # placeholder
    input_data = tf.placeholder(tf.int32, [config.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [config.batch_size, None])
    # create model
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_int),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=config.batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
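The main(_) signature follows the TensorFlow 1.x tf.app.run() convention, so the module is normally started as a script. A minimal entry-point sketch, assuming the rest of train.py defines main() as above:

import tensorflow as tf

if __name__ == '__main__':
    # tf.app.run() parses flags and dispatches to main(_), which trains the
    # LSTM language model with the settings from pycorrector.rnn_lm.config.
    tf.app.run()
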
github shibing624 / pycorrector / pycorrector / rnn_lm / infer.py
    word_to_idx = load_word_dict(config.word_dict_path)
    idx_to_word = {v: k for k, v in word_to_idx.items()}
    # init params
    batch_size = 1
    tf.reset_default_graph()
    input_data = tf.placeholder(tf.int32, [batch_size, None])
    output_targets = tf.placeholder(tf.int32, [batch_size, None])
    # init model
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_idx),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # init op
        sess.run(init_op)
        checkpoint = tf.train.latest_checkpoint(config.model_dir)
        saver.restore(sess, checkpoint)
        print("loading model from the checkpoint {0}".format(checkpoint))

        # infer each sentence
        for sentence in sentence_list:
            ppl = 0
            # data idx
            x = [word_to_idx[c] if c in word_to_idx else word_to_idx[UNK_TOKEN] for c in sentence]
            x = [word_to_idx[START_TOKEN]] + x + [word_to_idx[END_TOKEN]]
            # print('x:', x)
github shibing624 / pycorrector / pycorrector / rnn_lm / infer.py
    output_targets = tf.placeholder(tf.int32, [batch_size, None])
    # init model
    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(word_to_idx),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=batch_size,
                           learning_rate=config.learning_rate)
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # init op
        sess.run(init_op)
        checkpoint = tf.train.latest_checkpoint(config.model_dir)
        saver.restore(sess, checkpoint)
        print("loading model from the checkpoint {0}".format(checkpoint))

        # infer each sentence
        for sentence in sentence_list:
            ppl = 0
            # data idx
            x = [word_to_idx[c] if c in word_to_idx else word_to_idx[UNK_TOKEN] for c in sentence]
            x = [word_to_idx[START_TOKEN]] + x + [word_to_idx[END_TOKEN]]
            # print('x:', x)
            # reshape
            y = np.array(x[1:]).reshape((-1, batch_size))
            x = np.array(x[:-1]).reshape((-1, batch_size))
            # get each word perplexity
            word_count = x.shape[0]
            for i in range(word_count):
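A hypothetical usage sketch: ppl() scores each input sentence with the trained language model, and lower perplexity suggests a more fluent sentence. The return structure is assumed here to be the result dict initialised above, keyed by sentence:

from pycorrector.rnn_lm.infer import ppl

sentences = ['少先队员因该为老人让坐', '少先队员应该为老人让座']
scores = ppl(sentences)  # assumed shape: {sentence: perplexity}
for sent, perplexity in scores.items():
    print(perplexity, sent)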