How to use the pycorrector.deep_context.config module in pycorrector

To help you get started, we’ve selected a few pycorrector examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github shibing624 / pycorrector / pycorrector / deep_context / evaluate.py View on Github external
with open(answer_file, mode='r', encoding='utf-8') as f:
        gold_q_id = [line.split(' ', 1)[0] for line in f]

    print_mscc_score(gold_q_id, q_id_and_sim)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Please specify your input directory that contains MSCC dataset.')
        print('(Most of the case the name of the directory might be `Holmes_Training_Data`.)')
        print('sample usage: python src/eval/mscc.py ~/dataset/Holmes_Training_Data/')
        quit()
    create_mscc_dataset(sys.argv[1], 'dataset/mscc_train.txt')

    gpu_id = config.gpu_id
    model_path = config.model_path
    emb_path = config.emb_path
    # device
    use_cuda = torch.cuda.is_available() and gpu_id > -1
    if use_cuda:
        device = torch.device('cuda:{}'.format(gpu_id))
        torch.cuda.set_device(gpu_id)
    else:
        device = torch.device('cpu')

    # load model
    model, config_dict = read_model(model_path, device)
    unk_token = config_dict['unk_token']
    bos_token = config_dict['bos_token']
    eos_token = config_dict['eos_token']
github shibing624 / pycorrector / pycorrector / deep_context / infer.py View on Github external
tokens[target_pos] = unk_token
    tokens = [bos_token] + tokens + [eos_token]
    indexed_sentence = [stoi[token] if token in stoi else stoi[unk_token] for token in tokens]
    input_tokens = torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
    topv, topi = model.run_inference(input_tokens, target=None, target_pos=target_pos)
    for value, key in zip(topv, topi):
        print(value.item(), itos[key.item()])


if __name__ == "__main__":
    # Demo inputs: space-separated characters; "[]" presumably marks the
    # position whose token should be predicted (confirm in infer_one_sentence).
    sents = [
        "而 且 我 希 望 不 再 存 在 抽 [] 的 人 。",
        "男 女 分 班 的 问 题 有 什 [] 好 处 ?",
        "由 我 开 始 [] 起 。",
    ]
    # Load everything inference needs once, using the paths and GPU id
    # taken from the config module.
    infer_data = get_infer_data(config.model_path, config.emb_path, config.gpu_id)
    model, unk_token, bos_token, eos_token, itos, stoi, device = infer_data
    for sent in sents:
        infer_one_sentence(sent, model, unk_token, bos_token, eos_token, itos, stoi, device)
        # Blank line between the outputs of consecutive sentences.
        print()
github shibing624 / pycorrector / pycorrector / deep_context / infer.py View on Github external
except SyntaxError:
        pass
    tokens[target_pos] = unk_token
    tokens = [bos_token] + tokens + [eos_token]
    indexed_sentence = [stoi[token] if token in stoi else stoi[unk_token] for token in tokens]
    input_tokens = torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
    topv, topi = model.run_inference(input_tokens, target=None, target_pos=target_pos)
    for value, key in zip(topv, topi):
        print(value.item(), itos[key.item()])


if __name__ == "__main__":
    # Demo inputs: space-separated characters; "[]" presumably marks the
    # position whose token should be predicted (confirm in infer_one_sentence).
    sents = [
        "而 且 我 希 望 不 再 存 在 抽 [] 的 人 。",
        "男 女 分 班 的 问 题 有 什 [] 好 处 ?",
        "由 我 开 始 [] 起 。",
    ]
    # Load everything inference needs once, using the paths and GPU id
    # taken from the config module.
    infer_data = get_infer_data(config.model_path, config.emb_path, config.gpu_id)
    model, unk_token, bos_token, eos_token, itos, stoi, device = infer_data
    for sent in sents:
        infer_one_sentence(sent, model, unk_token, bos_token, eos_token, itos, stoi, device)
        # Blank line between the outputs of consecutive sentences.
        print()
github shibing624 / pycorrector / pycorrector / deep_context / preprocess.py View on Github external
word_seq = segment(text, cut_type='char', pos=False)
        word_arr.append(word_seq)
    return word_arr


def save_data_list(data_list, data_path):
    """Write each token sequence of ``data_list`` to ``data_path``, one per line.

    The tokens of a sequence are joined with a single space and terminated
    with a newline. The file is opened for writing as UTF-8 (truncating any
    existing content). After writing, the number of lines and the target
    path are reported on stdout.

    :param data_list: iterable of iterables of strings
    :param data_path: path of the output file
    :return: None
    """
    written = 0
    with open(data_path, 'w', encoding='utf-8') as out:
        for written, tokens in enumerate(data_list, start=1):
            out.write(' '.join(tokens) + '\n')
        # Reported while the file is still open, as in the original flow.
        print("save line size:%d to %s" % (written, data_path))


if __name__ == '__main__':
    # Ensure the directory that will hold the training file exists.
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between the check and
    # the makedirs() call. An empty dirname (bare file name) needs no directory.
    output_dir = os.path.dirname(config.train_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # train data: collect the parsed sequences of every raw training file
    train_words = []
    for path in config.raw_train_paths:
        train_words.extend(parse_xml_file(path))
    save_data_list(train_words, config.train_path)
github shibing624 / pycorrector / pycorrector / deep_context / preprocess.py View on Github external
def save_data_list(data_list, data_path):
    """Write each token sequence of ``data_list`` to ``data_path``, one per line.

    The tokens of a sequence are joined with a single space and terminated
    with a newline. The file is opened for writing as UTF-8 (truncating any
    existing content). After writing, the number of lines and the target
    path are reported on stdout.

    :param data_list: iterable of iterables of strings
    :param data_path: path of the output file
    :return: None
    """
    written = 0
    with open(data_path, 'w', encoding='utf-8') as out:
        for written, tokens in enumerate(data_list, start=1):
            out.write(' '.join(tokens) + '\n')
        # Reported while the file is still open, as in the original flow.
        print("save line size:%d to %s" % (written, data_path))


if __name__ == '__main__':
    # Ensure the directory that will hold the training file exists.
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory between the check and
    # the makedirs() call. An empty dirname (bare file name) needs no directory.
    output_dir = os.path.dirname(config.train_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # train data: collect the parsed sequences of every raw training file
    train_words = []
    for path in config.raw_train_paths:
        train_words.extend(parse_xml_file(path))
    save_data_list(train_words, config.train_path)
github shibing624 / pycorrector / pycorrector / deep_context / evaluate.py View on Github external
if use_cuda:
        device = torch.device('cuda:{}'.format(gpu_id))
        torch.cuda.set_device(gpu_id)
    else:
        device = torch.device('cpu')

    # load model
    model, config_dict = read_model(model_path, device)
    unk_token = config_dict['unk_token']
    bos_token = config_dict['bos_token']
    eos_token = config_dict['eos_token']

    # read vocab from word_emb path
    itos, stoi = load_vocab(emb_path)

    mscc_evaluation(config.question_file,
                    config.answer_file,
                    'mscc.result',
                    model,
                    stoi,
                    unk_token=unk_token,
                    bos_token=bos_token,
                    eos_token=eos_token,
                    device=device)
github shibing624 / pycorrector / pycorrector / deep_context / train.py View on Github external
write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path)
        torch.save(model.state_dict(), model_path)
        torch.save(optimizer.state_dict(), model_path + '_optim')


if __name__ == "__main__":
    # All training settings come from the config module. They are passed
    # positionally, so the order below must stay in sync with train()'s
    # parameter order.
    train_args = (
        config.train_path,
        config.emb_path,
        config.model_path,
        config.use_mlp,
        config.batch_size,
        config.epochs,
        config.maxlen,
        config.word_embed_size,
        config.hidden_size,
        config.learning_rate,
        config.n_layers,
        config.min_freq,
        config.dropout,
        config.gpu_id,
    )
    train(*train_args)
github shibing624 / pycorrector / pycorrector / deep_context / train.py View on Github external
print('epoch:[{}/{}], total_loss:[{}], best_cur_loss:[{}]'
              .format(epoch + 1, epochs, total_loss.item(), best_loss))


def save_checkpoint(epoch, model, optimizer, model_path, dataset, use_cuda, emb_path, is_best):
    """Persist the embeddings, model state and optimizer state for one epoch.

    Three artifacts are always written with an ``.epoch_<epoch + 1>`` suffix:
    the embeddings (via ``write_embedding``) at ``emb_path``, the model
    ``state_dict`` at ``model_path`` and the optimizer ``state_dict`` at
    ``model_path + '_optim'``. When ``is_best`` is truthy the same three
    artifacts are written a second time to the unsuffixed paths.

    :param epoch: zero-based epoch index; the suffix uses ``epoch + 1``
    :param model: object providing ``state_dict()`` and ``criterion.W``
    :param optimizer: object providing ``state_dict()``
    :param model_path: base path for the model / optimizer files
    :param dataset: object providing ``vocab.itos``
    :param use_cuda: forwarded unchanged to ``write_embedding``
    :param emb_path: base path for the embedding file
    :param is_best: whether to also overwrite the unsuffixed files
    :return: None
    """
    epoch_suffix = '.epoch_' + str(epoch + 1)

    # Per-epoch snapshot.
    write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path + epoch_suffix)
    torch.save(model.state_dict(), model_path + epoch_suffix)
    torch.save(optimizer.state_dict(), model_path + '_optim' + epoch_suffix)

    if not is_best:
        return

    # Best-so-far snapshot under the canonical (unsuffixed) names.
    write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path)
    torch.save(model.state_dict(), model_path)
    torch.save(optimizer.state_dict(), model_path + '_optim')


if __name__ == "__main__":
    # All training settings come from the config module. They are passed
    # positionally, so the order below must stay in sync with train()'s
    # parameter order.
    train_args = (
        config.train_path,
        config.emb_path,
        config.model_path,
        config.use_mlp,
        config.batch_size,
        config.epochs,
        config.maxlen,
        config.word_embed_size,
        config.hidden_size,
        config.learning_rate,
        config.n_layers,
        config.min_freq,
        config.dropout,
        config.gpu_id,
    )
    train(*train_args)