# Imports and the opening Tokenizer.load(...) call are truncated in the source;
# the FARM-style Tokenizer import and the "bert-base-cased" model name below are assumptions.
from farm.modeling.tokenization import Tokenizer
from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",
    do_lower_case=True
)
assert type(tokenizer) == BertTokenizer
assert tokenizer.basic_tokenizer.do_lower_case == True

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="xlnet-base-cased",
    do_lower_case=True
)
assert type(tokenizer) == XLNetTokenizer
assert tokenizer.do_lower_case == True

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="roberta-base"
)
assert type(tokenizer) == RobertaTokenizer
import logging, os, pickle
import torch
from torch.utils.data import Dataset
from transformers import (BertConfig, BertForMaskedLM, BertTokenizer, CamembertConfig,
                          CamembertForMaskedLM, CamembertTokenizer, DistilBertConfig,
                          DistilBertForMaskedLM, DistilBertTokenizer, GPT2Config,
                          GPT2LMHeadModel, GPT2Tokenizer, OpenAIGPTConfig,
                          OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, RobertaConfig,
                          RobertaForMaskedLM, RobertaTokenizer)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}
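
# A minimal sketch (not part of the original script) of how MODEL_CLASSES is
# typically consumed: look up the tuple for a model type, then load config,
# tokenizer, and model. "roberta-base" is just an illustrative checkpoint name.
config_class, model_class, tokenizer_class = MODEL_CLASSES["roberta"]
config = config_class.from_pretrained("roberta-base")
tokenizer = tokenizer_class.from_pretrained("roberta-base")
model = model_class.from_pretrained("roberta-base", config=config)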
class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                # The source snippet is cut off here; loading the pickled examples
                # is the usual next step for this cache check.
                self.examples = pickle.load(handle)
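
    # A minimal sketch (not from the source): the remaining Dataset protocol
    # methods such a cached dataset typically needs so a DataLoader can iterate it.
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])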
def benchmark_mnli(samples):
    torch_hub_model = time_fn(torch.hub.load, 'pytorch/fairseq', 'roberta.large.mnli')

    # Retry with a forced re-download if the cached checkpoint cannot be loaded.
    try:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli')
    except Exception:
        transformers_model = time_fn(transformers.RobertaModel.from_pretrained,
                                     'roberta-large-mnli', force_download=True)
    transformers_tokenizer = time_fn(transformers.RobertaTokenizer.from_pretrained, 'roberta-large-mnli')

    pred_functions = {
        'transformers': predict_transformers(transformers_model, transformers_tokenizer),
        'torch_hub': predict_roberta(torch_hub_model),
    }
    for framework, pred_fn in pred_functions.items():
        print(f'Benchmarking {framework} with {samples} samples')
        time_fn(benchmark, pred_fn, samples)
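
# time_fn is referenced above but not defined in this snippet. A minimal sketch
# of what such a helper might look like (an assumption, not the original):
import time

def time_fn(fn, *args, **kwargs):
    """Call fn with the given arguments, print how long it took, and return its result."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    print(f'{getattr(fn, "__name__", repr(fn))} took {time.perf_counter() - start:.2f}s')
    return result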
def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True):
    """
    Initializes a MultiLabelClassification model.

    Args:
        model_type: The type of model (bert, roberta, xlnet, xlm, distilbert, albert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        num_labels (optional): The number of labels or classes in the dataset.
        pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
    """
    MODEL_CLASSES = {
        'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
        'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
        'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
        'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
        'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
        'albert': (AlbertConfig, AlbertForMultiLabelSequenceClassification, AlbertTokenizer)
    }

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    if num_labels:
        self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
        self.num_labels = num_labels
    else:
        self.config = config_class.from_pretrained(model_name)
        self.num_labels = self.config.num_labels
    self.tokenizer = tokenizer_class.from_pretrained(model_name)
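
# A hedged usage sketch: assuming the __init__ above belongs to a class named
# MultiLabelClassificationModel (the class name is not shown in this snippet),
# it would be instantiated roughly like this.
model = MultiLabelClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=4,
    pos_weight=[1.0, 3.0, 3.0, 0.5],  # up-weight rare labels in the loss
    use_cuda=False,
)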
def __init__(self, model_type, model_name, num_labels=2, args=None, use_cuda=True):
    """
    Initializes a MultiLabelClassification model.

    Args:
        model_type: The type of model (bert, roberta, xlnet, xlm, distilbert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        num_labels (optional): The number of labels or classes in the dataset.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
    """
    MODEL_CLASSES = {
        'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
        'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
        'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
        'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
        'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
    }

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer = tokenizer_class.from_pretrained(model_name)
    self.model = model_class.from_pretrained(model_name, num_labels=num_labels)
    self.num_labels = num_labels

    if use_cuda:
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.")
    else:
        # The source snippet is cut off here; falling back to the CPU is the usual behaviour.
        self.device = torch.device("cpu")
    # Fragment: the enclosing method and the MakeAligner factory that defines
    # the Aligner class are truncated in the source.
    # Functional version that works with dictionaries
    return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]

    def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
        """Extract tokens from a document, accounting for exceptions only if needed."""
        tokens = doc_to_fixed_tokens(doc)
        return tokens

    return Aligner


english = "en_core_web_sm"

BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)
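
# MakeAligner itself is not shown in this snippet. A rough sketch (an assumption,
# not the original implementation) of such a factory: bind a spaCy pipeline to a
# tokenizer subclass so subword tokens can later be aligned with spaCy tokens.
import spacy

def MakeAlignerSketch(pretrained_tokenizer_class, spacy_model_name):
    nlp = spacy.load(spacy_model_name)

    class Aligner(pretrained_tokenizer_class):
        spacy_nlp = nlp  # shared spaCy pipeline used by alignment helpers

    return Aligner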
        labels (optional): A list of all Named Entity labels. If not given, ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] will be used.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
    """
    if labels:
        self.labels = labels
    else:
        self.labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
    self.num_labels = len(self.labels)

    if roberta_available:
        MODEL_CLASSES = {
            'bert': (BertConfig, BertForTokenClassification, BertTokenizer),
            'roberta': (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
            'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer)
        }
    else:
        MODEL_CLASSES = {
            'bert': (BertConfig, BertForTokenClassification, BertTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
            'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer)
        }

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.model = model_class.from_pretrained(model_name, num_labels=self.num_labels)

    if use_cuda:
        if torch.cuda.is_available():
            # The source snippet is cut off here; assigning the CUDA device, as in
            # the classification models above, is the usual next step.
            self.device = torch.device("cuda")
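
# A hedged usage sketch: assuming the fragment above belongs to a class named
# NERModel (the class name is not shown here), a custom tag set would be passed like this.
custom_labels = ["O", "B-PRODUCT", "I-PRODUCT"]
ner_model = NERModel("bert", "bert-base-cased", labels=custom_labels, use_cuda=False)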
    Args:
        model_type: The type of model (bert, xlnet, xlm, roberta, distilbert, albert, camembert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        num_labels (optional): The number of labels or classes in the dataset.
        weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
        sliding_window (optional): Use a sliding window when tokenizing to prevent truncating long sequences. Default = False.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
    """
    MODEL_CLASSES = {
        'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
        'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
        'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
        'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
        'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
        'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer)
    }

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    if num_labels:
        self.config = config_class.from_pretrained(model_name, num_labels=num_labels)
        self.num_labels = num_labels
    else:
        self.config = config_class.from_pretrained(model_name)
        self.num_labels = self.config.num_labels
    self.tokenizer = tokenizer_class.from_pretrained(model_name)
    self.weight = weight
    self.sliding_window = sliding_window
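
# A hedged usage sketch: assuming the fragment above belongs to a
# ClassificationModel-style class (the class name and full signature are not
# shown), class weights and the sliding-window option would be passed like this.
clf = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=3,
    weight=[1.0, 2.0, 0.5],   # up-weight the under-represented second class
    sliding_window=True,      # chunk long sequences instead of truncating them
    use_cuda=False,
)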
def get_input_ids(text, tokenizer):
    if isinstance(tokenizer, RobertaTokenizer):
        input_ids = tokenizer.encode(text, text_pair=None, add_special_tokens=True, add_prefix_space=True)
    else:
        input_ids = tokenizer.encode(text, text_pair=None, add_special_tokens=True)
    return input_ids
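
# A quick illustration (assuming a transformers 2.x-era RobertaTokenizer, where
# encode() accepts add_prefix_space) of calling the helper with two tokenizer types.
bert_ids = get_input_ids("Hello world", BertTokenizer.from_pretrained("bert-base-uncased"))
roberta_ids = get_input_ids("Hello world", RobertaTokenizer.from_pretrained("roberta-base"))
print(bert_ids)     # [CLS] ... [SEP] ids for BERT
print(roberta_ids)  # <s> ... </s> ids for RoBERTa, first token encoded with a leading space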
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
    super(RoBERTa, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 512:
        logging.warning("RoBERTa only allows a max_seq_length of 512 (514 with special tokens). Value will be set to 512")
        max_seq_length = 512
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.roberta = RobertaModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
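
# A minimal sketch (assumed, not shown in this snippet) of how config_keys is
# typically consumed by such wrapper modules: serialize just those attributes so
# the module can be reloaded with the same settings.
def get_config_dict(self):
    return {key: self.__dict__[key] for key in self.config_keys}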