def setUp(self):
    self.model_tester = XLNetModelTest.XLNetModelTester(self)
    self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
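# In the transformers test suite, the class defining this setUp would then
# exercise the config via the tester; a sketch of that companion test (an
# assumption based on the upstream ConfigTester API):
def test_config(self):
    self.config_tester.run_common_tests()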
"""
Initializes a ClassificationModel model.
Args:
model_type: The type of model (bert, xlnet, xlm, roberta, distilbert)
model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_nodel.bin).
num_labels (optional): The number of labels or classes in the dataset.
weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
"""
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
if num_labels:
    self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
    self.num_labels = num_labels
else:
    self.config = config_class.from_pretrained(model_name)
    self.num_labels = self.config.num_labels
self.weight = weight
# Head of this import is truncated in the source; reconstructed here from
# the MODEL_CLASSES mapping below.
from transformers import (
    BertConfig, BertForSequenceClassification, BertTokenizer,
    XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
    XLMConfig, XLMForSequenceClassification, XLMTokenizer,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    CamembertConfig,
    CamembertForSequenceClassification,
    CamembertTokenizer,
    AlbertConfig,
    AlbertForSequenceClassification,
    AlbertTokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    "distilbert": (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
    "camembert": (
        CamembertConfig,
        CamembertForSequenceClassification,
        CamembertTokenizer,
    ),
}
"""
Initializes a ClassificationModel model.
Args:
model_type: The type of model (bert, xlnet, xlm, roberta, distilbert, albert, camembert)
model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_nodel.bin).
num_labels (optional): The number of labels or classes in the dataset.
weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
sliding_window (optional): Use a sliding window when tokenizing to prevent truncating long sequences. Default = False.
args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
"""
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
if num_labels:
    self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
    self.num_labels = num_labels
else:
    self.config = config_class.from_pretrained(model_name)
    self.num_labels = self.config.num_labels
self.tokenizer = tokenizer_class.from_pretrained(model_name)
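# A plausible continuation at this point (an assumption, mirroring the
# config/tokenizer pattern above): instantiate the model itself from the
# resolved class.
self.model = model_class.from_pretrained(model_name, config=self.config)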
DistilBertTokenizer,
)
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
MODEL_CLASSES = {
    "bert": (
        BertConfig,
        (BertForSequenceClassification, BertForMultiLabelSequenceClassification),
        BertTokenizer,
    ),
    "xlnet": (
        XLNetConfig,
        (XLNetForSequenceClassification, XLNetForMultiLabelSequenceClassification),
        XLNetTokenizer,
    ),
    "xlm": (
        XLMConfig,
        (XLMForSequenceClassification, XLMForSequenceClassification),
        XLMTokenizer,
    ),
    "roberta": (
        RobertaConfig,
        (RobertaForSequenceClassification, RobertaForMultiLabelSequenceClassification),
        RobertaTokenizer,
    ),
"distilbert": (
DistilBertConfig,
(
XLMForSequenceClassification,
XLMTokenizer,
XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
}
# Create text corpus suitable for language model training
def create_corpus(text_list, target_path, logger=None):
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "textcat"])
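    # The rest of this function is truncated in the source. A plausible
    # continuation (an assumption): write one sentence per line, using the
    # spaCy parser (left enabled above) for sentence boundaries.
    with open(target_path, "w", encoding="utf-8") as f:
        for text in text_list:
            for sent in nlp(text).sents:
                f.write(sent.text.strip() + "\n")
    if logger:
        logger.info("Wrote corpus to %s", target_path)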
import argparse
from seqeval import metrics
from fastprogress import master_bar, progress_bar
from transformers import create_optimizer, GradientAccumulator
from transformers import squad_convert_examples_to_features, SquadV1Processor, SquadV2Processor
import datetime
import logging
import math
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, XLNetConfig, XLMConfig, DistilBertConfig)
    ),
    (),
)
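# Aside (illustrative): the sum(..., ()) idiom above concatenates the
# per-config tuples into one flat tuple of checkpoint names, e.g.:
assert sum(((1, 2), (3,)), ()) == (1, 2, 3)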
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForQuestionAnswering, BertTokenizer),
    "xlnet": (XLNetConfig, TFXLNetForQuestionAnsweringSimple, XLNetTokenizer),
    "xlm": (XLMConfig, TFXLMForQuestionAnsweringSimple, XLMTokenizer),
    "distilbert": (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertTokenizer),
}
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, train_batch_size):
    if args.max_steps > 0:
        num_train_steps = args.max_steps * args.gradient_accumulation_steps
        args.num_train_epochs = 1
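    # The else branch is truncated in the source; a typical counterpart
    # (an assumption) derives the step count from the dataset size:
    else:
        num_train_steps = (
            math.ceil(num_train_examples / train_batch_size) * args.num_train_epochs
        )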
import logging
logging.basicConfig(level=logging.INFO)
MODEL_CLASSES = {
    'bert': (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'gpt2': (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlnet': (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlm': (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta': (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
}
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
    if model_type not in MODEL_CLASSES:
        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
    config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
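# Hypothetical invocation of the converter above; the paths are
# placeholders, not from the source.
convert_pt_checkpoint_to_tf(
    model_type="bert",
    pytorch_checkpoint_path="./pt_model/pytorch_model.bin",
    config_file="./pt_model/config.json",
    tf_dump_path="./tf_model/",
    compare_with_pt_model=True,
)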
import logging
# Import head truncated in the source; the module name is inferred from the
# utils_squad_evaluate import below.
from utils_squad import (RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
# The following import is the official SQuAD evaluation script (2.0).
# You can remove it from the dependencies if you are using this script outside of the library.
# We've added it here for automated tests (see examples/test_examples.py).
from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
}
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()
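# Example usage (illustrative): seed everything before training, then
# convert model outputs to plain Python lists for serialization. `args`
# is a stand-in namespace here, not the script's real argparse result;
# random/np/torch are assumed imported by the surrounding script.
from types import SimpleNamespace
set_seed(SimpleNamespace(seed=42, n_gpu=0))
logits = torch.tensor([[0.25, 0.75]])
print(to_list(logits))  # [[0.25, 0.75]]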
def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """