How to use the pycorrector.rnn_attention.util.get_tokenizer function in pycorrector

To help you get started, we've selected a few examples showing how get_tokenizer is used in public projects; all four snippets below come from the shibing624/pycorrector repository itself.

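All four snippets pass get_tokenizer() into the data helpers. A minimal sketch of the pattern, assuming (as sentence_to_token_ids in Example 2 suggests) that the returned callable maps a sentence string to a list of tokens:

from pycorrector.rnn_attention.util import get_tokenizer

tokenizer = get_tokenizer()               # selected according to FLAGS.tokenizer
tokens = tokenizer("a sample sentence")   # hypothetical input
print(tokens)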

Example 1: pycorrector/rnn_attention/error_analysis.py (from shibing624/pycorrector)

def setup_batch_decode(sess):
    """Decode the dev sets in batches: load the language model, data, vocabulary and model."""
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size,
        tokenizer=get_tokenizer())  # , other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")
    vocab, reverse_vocab = initialize_vocabulary(vocab_path, bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    return model, x_dev, y_dev
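
setup_batch_decode expects an open TensorFlow session and returns the model plus the dev split; a minimal sketch of the call pattern, assuming FLAGS has already been configured:

import tensorflow as tf

with tf.Session() as sess:
    model, x_dev, y_dev = setup_batch_decode(sess)
    # model is now ready for batched decoding over (x_dev, y_dev)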
Example 2: pycorrector/rnn_attention/infer.py (from shibing624/pycorrector)

def tokenize(sent, vocab, depth=FLAGS.num_layers):
    # Pad each sequence so its length is a multiple of 2 ** (depth - 1).
    align = pow(2, depth - 1)
    token_ids = sentence_to_token_ids(sent, vocab, get_tokenizer())
    ones = [1] * len(token_ids)  # mask: 1 marks a real token, 0 will mark padding
    pad = (align - len(token_ids)) % align

    token_ids += [PAD_ID] * pad
    ones += [0] * pad

    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])

    return source, mask
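
To make the padding arithmetic concrete, here is the same computation on made-up numbers (with FLAGS.num_layers = 3, the length must be a multiple of 2 ** (3 - 1) = 4):

align = 2 ** (3 - 1)                 # 4
token_ids = [11, 42, 7, 19, 3, 25]   # hypothetical ids for a 6-token sentence
pad = (align - len(token_ids)) % align
print(pad)                           # 2: padded length is 8, a multiple of 4
# the mask becomes [1, 1, 1, 1, 1, 1, 0, 0]; zeros mark the padded positions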
Example 3: pycorrector/rnn_attention/train.py (from shibing624/pycorrector)

def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())

    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
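
The snippet is truncated after create_model. train() follows the usual TF1 script layout, so a typical entry point (an assumption, not shown in the source) looks like:

def main(_):
    train()

if __name__ == "__main__":
    tf.app.run()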
Example 4: pycorrector/rnn_attention/infer.py (from shibing624/pycorrector)

def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)
    else:
        print('No lmfile given; consider providing a KenLM ARPA language model file')

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, reverse_vocab = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")

            output_sent = fix_sent(model, sess, sent)

            print("Candidate: ", output_sent)