How to use the tokenizers.ByteLevelBPETokenizer class in tokenizers

To help you get started, we've selected a few tokenizers examples, based on popular ways the library is used in public projects.

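Before looking at the project examples below, here is a minimal end-to-end sketch: train a byte-level BPE tokenizer on a text file, encode a sentence, and save the result. The file and directory names are placeholders, and save_model assumes tokenizers >= 0.8 (older releases used save(directory, name), as in the first example below).

from tokenizers import ByteLevelBPETokenizer

# Start from an untrained byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Learn a vocabulary from plain-text training data ("corpus.txt" is a placeholder)
tokenizer.train(files=["corpus.txt"], vocab_size=10000, min_frequency=2)

# Byte-level BPE can encode any string, including out-of-vocabulary words
encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)
print(encoding.ids)

# Write vocab.json and merges.txt to an output directory
tokenizer.save_model("output_dir")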

github allenai / vampire / scripts / train_tokenizer.py
import argparse
import json
import os

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        SentencePieceBPETokenizer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=False, help="input text file, use '-' for stdin")
    parser.add_argument("--tokenizer_type", type=str, choices=['BPE', 'BBPE', 'BERT'], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", help='path to output BPE model')
    parser.add_argument("--vocab_size", help='vocabulary size', type=int, default=10000)
    args = parser.parse_args()

    # Initialize a tokenizer of the requested type
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer
    }[args.tokenizer_type]

    tokenizer = tokenizer()

    # Then train it!
    tokenizer.train(args.input_file, vocab_size=args.vocab_size)

    # Save the trained tokenizer (tokenizer.save(dir, name) is the pre-0.8
    # API; newer releases use save_model) plus the arguments used to build it
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    tokenizer.save(args.serialization_dir, 'tokenizer')
    with open(os.path.join(args.serialization_dir, "config.json"), "w+") as f:
        config = vars(args)
        json.dump(config, f)
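The script above writes a vocab/merges pair under serialization_dir. Reloading it later looks roughly like this; the "tokenizer-" file prefix is an assumption based on the name passed to save above.

from tokenizers import ByteLevelBPETokenizer

# Restore the trained tokenizer from the files written by save()
tokenizer = ByteLevelBPETokenizer(
    "serialization_dir/tokenizer-vocab.json",
    "serialization_dir/tokenizer-merges.txt",
)
print(tokenizer.encode("reloaded and ready").tokens)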

github CogStack / MedCAT / medcat / meta_cat.py
def load(self, model='lstm', tokenizer_name='bbpe'):
    """ Loads the model and config for this meta annotation """
    # Load the tokenizer from its saved vocab/merges files if not already set
    if self.tokenizer is None:
        vocab_file = os.path.join(self.save_dir, "{}-vocab.json".format(tokenizer_name))
        merges_file = os.path.join(self.save_dir, "{}-merges.txt".format(tokenizer_name))
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, lowercase=True)

    # Load pre-computed token embeddings if not already set
    if self.embeddings is None:
        embeddings = np.load(open(os.path.join(self.save_dir, "embeddings.npy"), 'rb'))
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)

    # Load configuration
    self.load_config()

    # Load the underlying model (e.g. the LSTM)
    self.load_model(model=model)
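Note the lazy-initialization pattern here: the tokenizer and embeddings are only constructed if they have not been set yet, so a caller can inject its own instances before calling load. Passing lowercase=True adds a lowercasing step when encoding, which presumably matches how the vocab/merges files were trained.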

github allenai / vampire / vampire / api / tokenizer.py
import json
import os
from typing import Any, Tuple

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, SentencePieceBPETokenizer)
from transformers import AutoTokenizer


def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    if os.path.isdir(tokenizer_path):
        # A directory means a locally trained tokenizer: read its config
        # to find out which tokenizer class produced it
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
            # BPE-style tokenizers are restored from a vocab.json / merges.txt pair
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # WordPiece (BERT) tokenizers only need a vocab.txt
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        # Otherwise treat the path as a Hugging Face model name
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
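A quick sketch of calling this loader; both arguments shown are placeholders:

# A local directory produced by one of the training scripts above
tokenizer, is_hf = load_huggingface_tokenizer("path/to/serialization_dir")
assert is_hf is False

# Anything else is treated as a Hugging Face model name
tokenizer, is_hf = load_huggingface_tokenizer("roberta-base")
assert is_hf is True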