How to use the tokenizers.SentencePieceBPETokenizer function in tokenizers

To help you get started, we've selected a few tokenizers examples drawn from public open-source projects that show popular ways SentencePieceBPETokenizer is used.

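Before looking at the project examples below, here is a minimal sketch of training and using a SentencePieceBPETokenizer directly; the corpus file name and vocabulary size are placeholders, not values taken from the projects shown here.

from tokenizers import SentencePieceBPETokenizer

# Train a SentencePiece-style BPE tokenizer from scratch on a plain-text corpus
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(files=["corpus.txt"], vocab_size=10000)  # "corpus.txt" is a placeholder

# encode() returns an Encoding object exposing both subword strings and ids
encoding = tokenizer.encode("Hello world")
print(encoding.tokens)
print(encoding.ids)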

From github.com/allenai/vampire, vampire/api/tokenizer.py:
import json
import os
from typing import Any, Tuple

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, SentencePieceBPETokenizer)
from transformers import AutoTokenizer


def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    # A local directory is treated as a tokenizer trained with the tokenizers
    # library; anything else is passed to transformers' AutoTokenizer.
    if os.path.isdir(tokenizer_path):
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
            # BPE-style tokenizers are rebuilt from a vocab.json / merges.txt pair
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # WordPiece (BERT) tokenizers only need a vocab.txt file
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
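
A hypothetical usage sketch for the function above; the directory path and the sample sentence are placeholders:

tokenizer, is_transformers = load_huggingface_tokenizer("serialization_dir")  # placeholder path

if is_transformers:
    # transformers tokenizers return a plain list of ids from encode()
    ids = tokenizer.encode("some text")
else:
    # tokenizers-library tokenizers return an Encoding object
    ids = tokenizer.encode("some text").ids
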
From github.com/allenai/vampire, scripts/train_tokenizer.py:
import argparse
import json
import os

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        SentencePieceBPETokenizer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=False, help="input text file, use '-' for stdin")
    parser.add_argument("--tokenizer_type", type=str, choices=['BPE', 'BBPE', "BERT"], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", help='path to output BPE model')
    parser.add_argument("--vocab_size", help='YTTM vocab size', type=int, default=10000)
    args = parser.parse_args()
    # Pick the tokenizer class matching the requested type, then instantiate it
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer,
    }[args.tokenizer_type]

    tokenizer = tokenizer()

    # Then train it!
    tokenizer.train(args.input_file, vocab_size=args.vocab_size)
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    tokenizer.save(args.serialization_dir, 'tokenizer')
    # Persist the CLI arguments (including tokenizer_type) so the tokenizer can be reloaded later
    with open(os.path.join(args.serialization_dir, "config.json"), "w+") as f:
        config = vars(args)
        json.dump(config, f)
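
The directory written by this script is what load_huggingface_tokenizer in the first example reads back. A minimal reload sketch, assuming the save step produced tokenizer-vocab.json and tokenizer-merges.txt; the exact file names depend on the tokenizers version, and newer releases name the constructor arguments vocab/merges instead of vocab_file/merges_file.

from tokenizers import SentencePieceBPETokenizer

# Hypothetical file names; use whatever tokenizer.save() actually wrote
tokenizer = SentencePieceBPETokenizer(
    vocab_file="serialization_dir/tokenizer-vocab.json",
    merges_file="serialization_dir/tokenizer-merges.txt",
)
print(tokenizer.encode("hello world").tokens)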