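# Imports assumed by this snippet; the original file's header is not shown here.
# Project helpers (prepare_nlc_data, initialize_vocabulary, sentence_to_token_ids,
# get_tokenizer, create_model, fix_sent, PAD_ID, FLAGS) are defined elsewhere in the project.
import os
import logging

import numpy as np
import tensorflow as tf
import kenlm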

def setup_batch_decode(sess):
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size,
        tokenizer=get_tokenizer())  # , other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")

    vocab, reverse_vocab = initialize_vocabulary(vocab_path, bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)
    return model, x_dev, y_dev
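
# Usage sketch (assumed caller, not shown in the original snippet): the returned
# model and dev-set arrays are what a batched dev-set decoding loop needs.
#   with tf.Session() as sess:
#       model, x_dev, y_dev = setup_batch_decode(sess)
#       # iterate over (x_dev, y_dev) in batches and decode each batch with `model`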

def tokenize(sent, vocab, depth=FLAGS.num_layers):
    # Convert a sentence to token ids and pad with PAD_ID so the length is a
    # multiple of 2 ** (depth - 1); the mask marks real tokens with 1 and padding with 0.
    align = pow(2, depth - 1)
    token_ids = sentence_to_token_ids(sent, vocab, get_tokenizer())
    ones = [1] * len(token_ids)
    pad = (align - len(token_ids)) % align
    token_ids += [PAD_ID] * pad
    ones += [0] * pad
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
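
# Worked example (illustrative only, not from the original source): with
# FLAGS.num_layers = 3 the alignment is 2 ** (3 - 1) = 4, so a 6-token sentence
# needs pad = (4 - 6) % 4 = 2 extra PAD_ID entries:
#   token_ids: [4, 17, 9, 31, 8, 5] -> [4, 17, 9, 31, 8, 5, PAD_ID, PAD_ID]
#   mask:      [1,  1, 1,  1, 1, 1] -> [1,  1, 1,  1, 1, 1,      0,      0]
# Both are then reshaped to column vectors of shape (8, 1).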

def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)
    else:
        print("No lmfile given; consider providing a KenLM ARPA model file.")
print("Preparing NLC data in %s" % FLAGS.data_dir)
x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
vocab, reverse_vocab = initialize_vocabulary(vocab_path)
vocab_size = len(vocab)
print("Vocabulary size: %d" % vocab_size)
with tf.Session() as sess:
print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
model = create_model(sess, vocab_size, False)
while True:
sent = input("Enter a sentence: ")
output_sent = fix_sent(model, sess, sent)
print("Candidate: ", output_sent)