How to use the tokenizers.BertWordPieceTokenizer class in tokenizers

To help you get started, we’ve selected a few tokenizers examples, based on popular ways the library is used in public projects.
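
As a quick orientation before the project snippets, here is a minimal sketch of the typical BertWordPieceTokenizer workflow. The vocabulary file name and the sample sentence are illustrative, not taken from the projects below.

from tokenizers import BertWordPieceTokenizer

# Load a pretrained BERT vocabulary (one wordpiece per line).
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)

# Encode a sentence; the result carries tokens, ids and character offsets.
encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)   # e.g. ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']
print(encoding.ids)      # vocabulary ids for each wordpiece
print(encoding.offsets)  # (start, end) character spans in the original text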

github allenai / vampire / vampire / api / tokenizer.py
import json
import os
from typing import Any, Tuple

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, SentencePieceBPETokenizer)
from transformers import AutoTokenizer


def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    if os.path.isdir(tokenizer_path):
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        # Map the serialized tokenizer_type onto the matching tokenizers class.
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
            # BPE-style tokenizers are serialized as a vocab.json plus a merges.txt.
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # BertWordPieceTokenizer only needs a single vocab.txt file.
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        # Anything that is not a local directory is treated as a model name
        # or path and loaded with transformers' AutoTokenizer.
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
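
Assuming a serialization directory laid out as the function expects (a config.json with a tokenizer_type key next to the vocabulary files), a call could look like the following; the path is a placeholder.

tokenizer, is_transformers_tokenizer = load_huggingface_tokenizer("serialization_dir")
if is_transformers_tokenizer:
    ids = tokenizer.encode("some text")        # transformers tokenizer returns a list of ids
else:
    ids = tokenizer.encode("some text").ids    # tokenizers Encoding exposes .ids
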
github allenai / vampire / scripts / train_tokenizer.py
import argparse
import glob
import json
import os

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        SentencePieceBPETokenizer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=False, help="input text file, use '-' for stdin")
    parser.add_argument("--tokenizer_type", type=str, choices=['BPE', 'BBPE', "BERT"], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", help='path to output BPE model')
    parser.add_argument("--vocab_size", help='YTTM vocab size', type=int, default=10000)
    args = parser.parse_args()
    # Initialize a tokenizer
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer,
    }[args.tokenizer_type]

    tokenizer = tokenizer()

    # Then train it!
    tokenizer.train(args.input_file, vocab_size=args.vocab_size)
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    tokenizer.save(args.serialization_dir, 'tokenizer')
    with open(os.path.join(args.serialization_dir, "config.json"), "w+") as f:
        # Persist the CLI args (including tokenizer_type) so the tokenizer
        # can be reloaded later from its serialization directory.
        config = vars(args)
        json.dump(config, f)
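
For the 'BERT' choice, the same train-and-save steps can be reproduced directly in Python. This is a minimal sketch, not the repository's code: the corpus and output names are illustrative, and it mirrors the older tokenizers save signature (directory plus name) that the script above uses.

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()
tokenizer.train("corpus.txt", vocab_size=10000)   # older versions also accept a list of files
tokenizer.save("serialization_dir", "tokenizer")  # older API: directory + name; newer versions take a single .json path
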
github explosion / prodigy-recipes / other / transformers_tokenizers.py
"""Example recipe that shows how to use model-specific tokenizers like the
    BERT word piece tokenizer to preprocess your incoming text for fast and
    efficient NER annotation and to make sure that all annotations you collect
    always map to tokens and can be used to train and fine-tune your model
    (even if the tokenization isn't that intuitive, because word pieces). The
    selection automatically snaps to the token boundaries and you can double-click
    single tokens to select them.

    Setting "honor_token_whitespace": true will ensure that whitespace between
    tokens is only shown if whitespace is present in the original text. This
    keeps the text readable.

    Requires Prodigy v1.10+ and uses the HuggingFace tokenizers library."""
    stream = get_stream(source, loader=loader, input_key="text")
    # You can replace this with other tokenizers if needed
    tokenizer = BertWordPieceTokenizer(tokenizer_vocab, lowercase=lowercase)
    sep_token = tokenizer._parameters.get("sep_token")
    cls_token = tokenizer._parameters.get("cls_token")
    special_tokens = (sep_token, cls_token)
    wp_prefix = tokenizer._parameters.get("wordpieces_prefix")

    def add_tokens(stream):
        for eg in stream:
            tokens = tokenizer.encode(eg["text"])
            eg_tokens = []
            idx = 0
            for (text, (start, end), tid) in zip(
                tokens.tokens, tokens.offsets, tokens.ids
            ):
                # If we don't want to see special tokens, don't add them
                if hide_special and text in special_tokens:
                    continue