How to use the transformers.AutoTokenizer class in transformers

To help you get started, we’ve selected a few transformers.AutoTokenizer examples, drawn from the ways it is commonly used in public projects.

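In every snippet below the entry point is the same: AutoTokenizer.from_pretrained inspects a model name or a local checkpoint directory and returns the matching tokenizer class. A minimal sketch of the basic pattern (the model name here is just an illustrative choice):

from transformers import AutoTokenizer

# download (and cache) the vocabulary for a hub model,
# then turn a sentence into token ids
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Hello world", add_special_tokens=True)
# for BERT this wraps the text as [CLS] hello world [SEP],
# e.g. [101, 7592, 2088, 102]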

github koursaros-ai / nboost / nboost / models / pt_models / bert.py
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger.info('Loading from checkpoint %s' % self.model_dir)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if self.device == torch.device("cpu"):
            self.logger.info("RUNNING ON CPU")
        else:
            self.logger.info("RUNNING ON CUDA")
            torch.cuda.synchronize(self.device)

        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(self.model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        self.rerank_model.to(self.device, non_blocking=True)
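Here the tokenizer and reranker are loaded from the same checkpoint directory, so their vocabularies match. For context, a sketch of how such a pair scores a query-passage pair (tokenizer and rerank_model stand for the attributes loaded above; this is not nboost's actual scoring code, and it assumes the model sits on CPU):

import torch

query = "what is a neural reranker?"
passage = "A reranker re-scores candidate passages returned by a search engine."
# encode_plus builds the paired model inputs (ids, token type ids, ...)
inputs = tokenizer.encode_plus(query, passage, return_tensors="pt")
with torch.no_grad():
    logits = rerank_model(**inputs)[0]  # models return tuples in transformers 2.x
score = logits.softmax(-1)[0, 1].item()  # probability of the "relevant" class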
github catalyst-team / catalyst / catalyst / data / nlp / classify.py
"""

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            self.label_dict = dict(zip(sorted(set(labels)),
                                       range(len(set(labels)))))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger(
            "transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <text> [SEP] .. <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]
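The direct vocab lookups above assume a BERT-style wordpiece vocabulary. Tokenizers returned by AutoTokenizer also expose the same ids through model-agnostic attributes, which holds up across architectures; a small check (the model name is illustrative):

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# these attributes are available regardless of the concrete tokenizer class
assert tokenizer.vocab["[SEP]"] == tokenizer.sep_token_id
assert tokenizer.vocab["[CLS]"] == tokenizer.cls_token_id
assert tokenizer.vocab["[PAD]"] == tokenizer.pad_token_id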
github koursaros-ai / nboost / neural_rerank / models / dbert.py
        from transformers import (AutoConfig,
                                  AutoModelForSequenceClassification,
                                  AutoTokenizer,
                                  AdamW,
                                  ConstantLRSchedule)

        super().__init__(*args, **kwargs)
        model_config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.data_dir)
        model_config.num_labels = 1  # set up for regression
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device == torch.device("cpu"):
            self.logger.info("RUNNING ON CPU")
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            config=model_config,
            cache_dir=self.data_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.data_dir)
        self.rerank_model.to(self.device)

        self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
        self.scheduler = ConstantLRSchedule(self.optimizer)
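ConstantLRSchedule belongs to the transformers 2.x API; later releases replaced the schedule classes with helper functions. A rough equivalent under the newer API (assuming transformers >= 3.0):

from transformers import get_constant_schedule

# keeps the learning rate fixed at the optimizer's initial value
scheduler = get_constant_schedule(optimizer)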
github huggingface / transformers / examples / run_seq2seq_finetuning.py
    args = parser.parse_args()

    if args.model_type != "bert":
        raise ValueError(
            "Only the BERT architecture is currently supported for seq2seq."
        )

    # Set up training device
    # device = torch.device("cpu")

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = Model2Model.from_pretrained(args.model_name_or_path)
    # model.to(device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    train_dataset = load_and_cache_examples(args, tokenizer)
    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
    # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
github huggingface / transformers / hubconf.py
def tokenizer(*args, **kwargs):
    r""" 
        # Using torch.hub !
        import torch

        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`

    """

    return AutoTokenizer.from_pretrained(*args, **kwargs)
github allenai / vampire / vampire / api / tokenizer.py
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
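Because the two branches build different kinds of objects, the returned flag tells the caller which encode API to use: a transformers tokenizer returns ids directly, while a tokenizers-library object returns an Encoding. A sketch of the call site (load_tokenizer is a hypothetical name for the function above):

tokenizer, is_transformers_tokenizer = load_tokenizer(tokenizer_path)
if is_transformers_tokenizer:
    ids = tokenizer.encode("some text", add_special_tokens=True)
else:
    ids = tokenizer.encode("some text").ids  # tokenizers returns an Encoding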
github huggingface / transformers / examples / benchmarks.py
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # input_text is defined elsewhere in the full benchmarks script
        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()
            for slice_size in slice_sizes:
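                # (loop body truncated in this snippet -- a rough sketch of the
                #  timing step it performs, not the exact upstream code; assumes
                #  timeit is imported at module level)
                sequence = torch.tensor(tokenized_sequence[:slice_size], device=device)
                sequence = sequence.unsqueeze(0).repeat(batch_size, 1)
                runtime = timeit.timeit(lambda: model(sequence), number=average_over)
                dictionary[model_name]["results"][batch_size][slice_size] = runtime / average_over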
github alexandrainst / danlp / danlp / models / bert_models.py
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import AutoModelForTokenClassification
        from transformers import AutoTokenizer

        # download the model or load the model path
        weights_path = download_model('bert.ner', cache_dir,
                                      process_func=_unzip_process_func,
                                      verbose=verbose)

        self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
                           "I-ORG", "B-LOC", "I-LOC"]

        self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
        self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
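With the weights and tokenizer in place, tagging a sentence follows the usual token-classification pattern. A sketch of that step (this is not danlp's actual predict method; model, tokenizer and label_list stand for the attributes set in __init__ above):

import torch

input_ids = tokenizer.encode("Jens Hansen bor i København", return_tensors="pt")
with torch.no_grad():
    logits = model(input_ids)[0]  # shape (1, seq_len, num_labels)
predictions = logits.argmax(-1).squeeze(0).tolist()
tags = [label_list[p] for p in predictions]  # includes [CLS]/[SEP] positions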
github bheinzerling / dougu / dougu / transformers.py
def __init__(self, model_name, device=None, max_len=None):
        super().__init__()
        self.model_name = model_name
        self.device = device or _device
        self.log = get_logger()
        do_lower_case = "uncased" in model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, do_lower_case=do_lower_case)
        # self.begin_mention_idx = self.tokenizer.convert_tokens_to_ids(
        #     self.BEGIN_MENTION)

        if self.model_name.startswith('roberta'):
            self.BEGIN_MENTION = 'madeupword0000'
            self.END_MENTION = 'madeupword0001'
            self.add_special_symbols = self.add_special_symbols_roberta
        else:
            self.BEGIN_MENTION = '[unused0]'
            self.END_MENTION = '[unused1]'
            self.add_special_symbols = self.add_special_symbols_bert
        self.BEGIN_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
            self.BEGIN_MENTION)
        self.begin_mention_idx = self.BEGIN_MENTION_IDX
        self.END_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
            self.END_MENTION)
github huggingface / transformers / examples / benchmarks.py
def _compute_tensorflow(model_names, dictionary, average_over, amp):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)
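The timing loop that follows is omitted from this snippet; it mirrors the PyTorch sketch above, except that it benchmarks the compiled inference(...) wrapper. Wrapping the call in tf.function makes TensorFlow trace it into a graph on first use, so subsequent timed calls skip most of the Python overhead.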