How to use the transformers.AutoTokenizer.from_pretrained function in transformers

To help you get started, we've selected a few examples showing how transformers.AutoTokenizer.from_pretrained is used in popular public projects.

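In the simplest case, AutoTokenizer.from_pretrained loads the tokenizer that matches a model checkpoint name or a local directory. A minimal sketch (the checkpoint "bert-base-cased" is purely illustrative):

from transformers import AutoTokenizer

# Load the tokenizer associated with a checkpoint name or local path.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Encoding a string returns the input ids (plus attention mask) the model expects.
encoded = tokenizer("Hello, world!")
print(encoded["input_ids"])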

github stefan-it / fine-tuned-berts-seq / scripts / preprocess.py (View on GitHub)
import sys

from transformers import AutoTokenizer

dataset = sys.argv[1]
model_name_or_path = sys.argv[2]
max_len = int(sys.argv[3])

subword_len_counter = 0

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()

with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()

        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]

        current_subwords_len = len(tokenizer.tokenize(token))

        # Token contains strange control characters like \x96 or \x95
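
The snippet above subtracts the special tokens the tokenizer will add from the sequence budget and then counts subwords per whitespace token. A condensed, self-contained sketch of the same idea (checkpoint and word are illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

max_len = 128
# Reserve room for special tokens such as [CLS] and [SEP] before counting subwords.
budget = max_len - tokenizer.num_special_tokens_to_add()

# A single whitespace token may split into several subwords.
subword_count = len(tokenizer.tokenize("characteristically"))
print(subword_count, budget)
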
github huggingface / transformers / examples / run_summarization_finetuning.py (View on GitHub)
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
                args.output_dir
            )
        )

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    config = BertConfig.from_pretrained(args.model_name_or_path)
    decoder_model = BertForMaskedLM(config)
    model = Model2Model.from_pretrained(
        args.model_name_or_path, decoder_model=decoder_model
    )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
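
This example was written against an older transformers release (Model2Model is not part of recent versions), but the device-selection part is plain PyTorch. A self-contained sketch of just that part, with a stand-in for the script's --to_cpu flag:

import torch

to_cpu = False  # stand-in for the original argparse option, illustrative only

# Fall back to CPU when CUDA is unavailable or explicitly requested.
if to_cpu or not torch.cuda.is_available():
    device, n_gpu = torch.device("cpu"), 0
else:
    device, n_gpu = torch.device("cuda"), torch.cuda.device_count()

print(device, n_gpu)
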
github catalyst-team / catalyst / catalyst / data / nlp / classify.py (View on GitHub)
"""

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            self.label_dict = dict(zip(sorted(set(labels)),
                                       range(len(set(labels)))))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger(
            "transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <text> [SEP] .. <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]
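
Reading ids straight out of tokenizer.vocab, as above, assumes a BERT-style WordPiece vocabulary. The tokenizer also exposes the same ids as attributes, which works across tokenizer types; a short sketch with an illustrative checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# Equivalent to tokenizer.vocab["[SEP]"] etc. for BERT-like models,
# but defined for any tokenizer that declares these special tokens.
print(tokenizer.sep_token_id, tokenizer.cls_token_id, tokenizer.pad_token_id)
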
github sai-prasanna / lmproof / lmproof / scorer.py (View on GitHub)
def load(cls, language: str, device: str = "cpu") -> "TransformerLMScorer":
        if language == "en":
            tokenizer = AutoTokenizer.from_pretrained("gpt2")
            model = AutoModelWithLMHead.from_pretrained("distilgpt2")
            return cls(tokenizer, model, device)
        else:
            raise RuntimeError(f"Language {language} is not supported.")
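
distilgpt2 shares the GPT-2 vocabulary, so pairing the gpt2 tokenizer with the distilgpt2 model, as above, is consistent. A sketch of how such a scorer might compute a language-model loss for a sentence (AutoModelWithLMHead is the older class name; newer releases use AutoModelForCausalLM):

import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("distilgpt2")
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    # Passing the input ids as labels makes the model return its LM loss.
    outputs = model(**inputs, labels=inputs["input_ids"])
print(float(outputs[0]))  # mean negative log-likelihood per token
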
github koursaros-ai / nboost / neural_rerank / model / dbert.py (View on GitHub)
        from transformers import (AutoConfig,
                                  AutoModelForSequenceClassification,
                                  AutoTokenizer,
                                  AdamW,
                                  ConstantLRSchedule)

        super().__init__(*args, **kwargs)
        model_config = AutoConfig.from_pretrained(self.model_name)
        model_config.num_labels = 1  # set up for regression
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cpu":  # compare the device type, not the device object, to the string
            self.logger.info("RUNNING ON CPU")
        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            config=model_config)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.rerank_model.to(self.device, non_blocking=True)

        self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
        self.scheduler = ConstantLRSchedule(self.optimizer)
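
The core of this reranker setup is a sequence-classification head configured for a single regression-style relevance score (AdamW and ConstantLRSchedule come from the transformers release of that era). A reduced sketch of the model and tokenizer setup with an illustrative checkpoint:

import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert-base-uncased"  # illustrative checkpoint

config = AutoConfig.from_pretrained(model_name)
config.num_labels = 1  # a single output, used as a regression-style relevance score

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
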
github kh-kim / simple-ntc / classify_plm.py (View on GitHub)
def main(config):
    saved_data = torch.load(
        config.model_fn,
        map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id
    )

    train_config = saved_data['config']
    bert_best = saved_data['bert']
    index_to_label = saved_data['classes']

    lines = read_text()

    with torch.no_grad():
        # Declare model and load pre-trained weights.
        tokenizer = AutoTokenizer.from_pretrained(train_config.pretrained_model_name)
        model = BertForSequenceClassification.from_pretrained(
            train_config.pretrained_model_name,
            num_labels=len(index_to_label)
        )
        model.load_state_dict(bert_best)

        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
        device = next(model.parameters()).device

        # Don't forget turn-on evaluation mode.
        model.eval()

        y_hats = []
        for idx in range(0, len(lines), config.batch_size):
            mini_batch = tokenizer(
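
The excerpt cuts off mid-call. For context, a self-contained sketch of a batched tokenizer call of this kind (checkpoint name, sentences, and arguments are illustrative, not the script's actual ones):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint
lines = ["first example sentence", "second example sentence"]

# Tokenize a mini-batch of raw lines into padded, truncated tensors.
mini_batch = tokenizer(
    lines,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(mini_batch["input_ids"].shape)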