How to use the transformers.RobertaTokenizer class in transformers

To help you get started, we’ve selected a few transformers examples based on popular ways RobertaTokenizer is used in public projects.
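
Before the project excerpts, here is a minimal standalone sketch of loading and using RobertaTokenizer; the "roberta-base" checkpoint name is only an illustrative choice.

from transformers import RobertaTokenizer

# Download (or load from cache) the vocabulary and BPE merges for the checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Encode a sentence to input IDs, adding RoBERTa's <s> ... </s> special tokens.
input_ids = tokenizer.encode("Hello world", add_special_tokens=True)

# Decode back to text; special tokens are kept unless skip_special_tokens=True.
print(tokenizer.decode(input_ids, skip_special_tokens=True))  # Hello world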

github deepset-ai / FARM / test / test_tokenization.py (view on GitHub)
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="bert-base-cased",  # checkpoint name assumed; this excerpt begins mid-call
        do_lower_case=True
        )
    assert type(tokenizer) == BertTokenizer
    assert tokenizer.basic_tokenizer.do_lower_case == True

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="xlnet-base-cased",
        do_lower_case=True
        )
    assert type(tokenizer) == XLNetTokenizer
    assert tokenizer.do_lower_case == True

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="roberta-base"
        )
    assert type(tokenizer) == RobertaTokenizer
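
FARM's Tokenizer.load picks the tokenizer class from the checkpoint name; plain transformers offers the same convenience through AutoTokenizer. A minimal sketch, with the checkpoint name chosen for illustration:

from transformers import AutoTokenizer

# Recent transformers releases return the fast (Rust-backed) tokenizer by default;
# pass use_fast=False if the plain Python RobertaTokenizer class is required.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
print(type(tokenizer).__name__)  # RobertaTokenizer or RobertaTokenizerFast, depending on the release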

github huggingface / transformers / examples / run_lm_finetuning.py (view on GitHub)
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)


MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}


class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
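
Further down the same script (not shown in this excerpt), MODEL_CLASSES is indexed by the --model_type argument to pick matching config, model and tokenizer classes. A hedged sketch of that pattern, with the argparse attribute names assumed:

# Sketch only: args.model_type and args.model_name_or_path come from the script's argument parser.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)  # RobertaTokenizer for --model_type roberta
model = model_class.from_pretrained(args.model_name_or_path, config=config)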

github koursaros-ai / microservices / utils / modeling / migrating.py (view on GitHub)
def benchmark_mnli(samples):
    torch_hub_model = time_fn(torch.hub.load, 'pytorch/fairseq', 'roberta.large.mnli')
    try:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli')
    except Exception:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli', force_download=True)
    transformers_tokenizer = time_fn(transformers.RobertaTokenizer.from_pretrained, 'roberta-large-mnli')
    pred_functions = {
        'transformers' : predict_transformers(transformers_model, transformers_tokenizer),
        'torch_hub' : predict_roberta(torch_hub_model)
    }
    for framework, pred_fn in pred_functions.items():
        print(f'Benchmarking {framework} with {samples} samples')
        time_fn(benchmark, pred_fn, samples)
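
time_fn, predict_transformers and predict_roberta are helpers defined elsewhere in that repository. As a rough illustration of the transformers side, RobertaTokenizer encodes an MNLI premise/hypothesis pair like this (example sentences made up, and it is assumed that time_fn returns the wrapped call's result):

premise = "RoBERTa is a robustly optimized BERT pretraining approach."
hypothesis = "RoBERTa builds on BERT."

# encode() accepts a text pair and inserts RoBERTa's separator tokens between the two segments.
input_ids = transformers_tokenizer.encode(premise, hypothesis, add_special_tokens=True)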

github ThilinaRajapakse / simpletransformers / simpletransformers / experimental / classification / multi_label_classification_model.py (view on GitHub)
def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True):
        """
        Initializes a MultiLabelClassification model.

        Args:
            model_type: The type of model (bert, roberta)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            num_labels (optional): The number of labels or classes in the dataset.
            pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        """
        MODEL_CLASSES = {
            'bert':       (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
            'roberta':    (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
            'xlnet':      (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
            'xlm':        (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
            'albert':     (AlbertConfig, AlbertForMultiLabelSequenceClassification, AlbertTokenizer)
        }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        if num_labels:
            self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
            self.num_labels = num_labels
        else:
            self.config = config_class.from_pretrained(model_name)
            self.num_labels = self.config.num_labels
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
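
With model_type='roberta', the lookup above resolves to RobertaConfig and RobertaTokenizer; the equivalent direct calls look like this (checkpoint name and label count are illustrative):

from transformers import RobertaConfig, RobertaTokenizer

config = RobertaConfig.from_pretrained("roberta-base", num_labels=6)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")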

github ThilinaRajapakse / simpletransformers / simpletransformers / classification / multi_label_classification_model.py (view on GitHub)
def __init__(self, model_type, model_name, num_labels=2, args=None, use_cuda=True):
        """
        Initializes a MultiLabelClassification model.

        Args:
            model_type: The type of model (bert, roberta)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            num_labels (optional): The number of labels or classes in the dataset.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        """
        MODEL_CLASSES = {
            'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
            'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
            'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
            'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
        }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
        self.model = model_class.from_pretrained(model_name, num_labels=num_labels)
        self.num_labels = num_labels

        if use_cuda:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            else:
                raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.")
        else:
            self.device = "cpu"
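
For reference, a typical instantiation of this class with the RoBERTa entry of MODEL_CLASSES, via simpletransformers' documented import path (argument values illustrative):

from simpletransformers.classification import MultiLabelClassificationModel

# 'roberta' selects (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer) above.
model = MultiLabelClassificationModel('roberta', 'roberta-base', num_labels=6, use_cuda=False)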

github bhoov / exbert / server / aligner / aligner.py (view on GitHub)
# Functional version that works with dictionaries
            return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]

        def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
            """Extract tokens from a document, accounting for exceptions only if needed"""
            tokens = doc_to_fixed_tokens(doc)
            return tokens
        
    return Aligner
        
english = "en_core_web_sm"

BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)
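
These aligners pair spaCy word tokens with each tokenizer's subword output. RobertaTokenizer's byte-level BPE marks tokens that follow a space with a leading 'Ġ', which is exactly what the alignment logic has to account for; a standalone illustration (not from the repository):

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
print(tokenizer.tokenize("The quick brown fox"))
# e.g. ['The', 'Ġquick', 'Ġbrown', 'Ġfox'], where 'Ġ' marks a preceding space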

github ThilinaRajapakse / simpletransformers / simpletransformers / ner / ner_model.py (view on GitHub)
labels (optional): A list of all Named Entity labels.  If not given, ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] will be used.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        """

        if labels:
            self.labels = labels
        else:
            self.labels = ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        self.num_labels = len(self.labels)

        if roberta_available:
            MODEL_CLASSES = {
                'bert': (BertConfig, BertForTokenClassification, BertTokenizer),
                'roberta': (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
                'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
                'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer)
            }
        else:
            MODEL_CLASSES = {
                'bert': (BertConfig, BertForTokenClassification, BertTokenizer),
                'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
                'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer)
            }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

        self.model = model_class.from_pretrained(model_name, num_labels=self.num_labels)

        if use_cuda:
            if torch.cuda.is_available():
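
Instantiating the NER model with a RoBERTa backbone follows the same pattern; a hedged sketch using simpletransformers' documented entry point (label list and checkpoint illustrative):

from simpletransformers.ner import NERModel

# 'roberta' maps to (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer) above.
model = NERModel('roberta', 'roberta-base', labels=["O", "B-PER", "I-PER"], use_cuda=False)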

github ThilinaRajapakse / simpletransformers / simpletransformers / experimental / classification / classification_model.py (view on GitHub)
Args:
            model_type: The type of model (bert, xlnet, xlm, roberta, distilbert, albert, camembert)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            num_labels (optional): The number of labels or classes in the dataset.
            weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
            sliding_window (optional): Use a sliding window when tokenizing to prevent truncating long sequences. Default = False.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        """

        MODEL_CLASSES = {
            'bert':       (BertConfig, BertForSequenceClassification, BertTokenizer),
            'xlnet':      (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
            'xlm':        (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
            'roberta':    (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
            'albert':     (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
            'camembert':  (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer)
        }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        if num_labels:
            self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
            self.num_labels = num_labels
        else:
            self.config = config_class.from_pretrained(model_name)
            self.num_labels = self.config.num_labels
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
        self.weight = weight
        self.sliding_window = sliding_window

github huggingface / transformers / examples / utils_semeval.py (view on GitHub)
def get_input_ids(text, tokenizer):
    if isinstance(tokenizer, RobertaTokenizer):
        input_ids = tokenizer.encode(text, text_pair=None, add_special_tokens=True, add_prefix_space=True)
    else:
        input_ids = tokenizer.encode(text, text_pair=None, add_special_tokens=True)
    return input_ids
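
add_prefix_space matters here because RoBERTa's byte-level BPE treats a leading space as part of the token, so a word at the start of a string would otherwise be encoded differently from the same word mid-sentence. A hedged usage sketch; note that newer transformers releases expect add_prefix_space to be set on the tokenizer itself rather than passed to encode(), so this helper targets the older API:

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
input_ids = get_input_ids("RoBERTa uses byte-level BPE.", tokenizer)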

github UKPLab / sentence-transformers / sentence_transformers / models / RoBERTa.py (view on GitHub)
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(RoBERTa, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 512:
            logging.warning("RoBERTa only allows a max_seq_length of 512 (514 with special tokens). Value will be set to 512")
            max_seq_length = 512
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.roberta = RobertaModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
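
For context, this module is normally wrapped in the usual sentence-transformers pipeline, roughly as sketched below (checkpoint and pooling choices are illustrative, and it is assumed that the module exposes get_word_embedding_dimension() like the other word-embedding modules in that release):

from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.RoBERTa('roberta-base', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])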