try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}
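
# A minimal usage sketch (not part of the original snippet) of how such a
# (config, model, tokenizer) triple is typically consumed; "distilbert" and the
# checkpoint name here are only illustrative choices.
config_class, model_class, tokenizer_class = MODEL_CLASSES["distilbert"]
config = config_class.from_pretrained("distilbert-base-uncased")
tokenizer = tokenizer_class.from_pretrained("distilbert-base-uncased")
model = model_class.from_pretrained("distilbert-base-uncased", config=config)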
class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
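        else:
            # A hedged sketch (not in the original snippet) of the complementary
            # branch: tokenize the raw file, chunk it into block_size pieces,
            # and cache the result with pickle for the branch above to reuse.
            logger.info("Creating features from dataset file at %s", directory)
            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):
                self.examples.append(tokenized_text[i : i + block_size])
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)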
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
    ),
    (),
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
}
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
# (The def line and the start of this docstring were truncated in the original
# snippet; the name and signature below are assumed.)
def flatten_token_segments(segments):
    """Combine per-segment token index groups into document-level indices
    by offsetting and accounting for special tokens."""
    offset = 0
    output = []
    for segment in segments:
        if segment:
            offset += 1
            seen = set()
            for idx_group in segment:
                output.append([idx + offset for idx in idx_group])
                seen.update({idx for idx in idx_group})
            offset += len(seen)
    return output
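
# A quick trace (not in the original snippet) of the helper above, assuming each
# non-empty segment is preceded by one special token such as [CLS] or [SEP]:
# the first segment's groups shift by 1; the second segment's by 5
# (1 special token + 3 tokens already consumed + 1 special token).
example_segments = [[[0, 1], [2]], [[0], [1]]]
print(flatten_token_segments(example_segments))  # -> [[1, 2], [3], [5], [6]]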
class SerializableDistilBertTokenizer(
    transformers.DistilBertTokenizer, SerializationMixin
):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
# The following import is the official SQuAD evaluation script (2.0).
# You can remove it from the dependencies if you are using this script outside of the library
# We've added it here for automated tests (see examples/test_examples.py file)
from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())

MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
}

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()

def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
# Possible: speed, balance, accuracy
self.aim = aim

if self.aim == ENCODER_AIM.SPEED:
    # Uses more memory, takes very long to train, and prints odd debugging
    # output to the command line; consider waiting until it improves or
    # investigating why this happens (changing the pretrained model doesn't
    # seem to help).
    self._classifier_model_class = AlbertForSequenceClassification
    self._embeddings_model_class = AlbertModel
    self._tokenizer_class = AlbertTokenizer
    self._pretrained_model_name = 'albert-base-v2'
    self._model_max_len = 768
if self.aim == ENCODER_AIM.BALANCE:
    self._classifier_model_class = DistilBertForSequenceClassification
    self._embeddings_model_class = DistilBertModel
    self._tokenizer_class = DistilBertTokenizer
    self._pretrained_model_name = 'distilbert-base-uncased'
    self._model_max_len = 768
if self.aim == ENCODER_AIM.ACCURACY:
    self._classifier_model_class = DistilBertForSequenceClassification
    self._embeddings_model_class = DistilBertModel
    self._tokenizer_class = DistilBertTokenizer
    self._pretrained_model_name = 'distilbert-base-uncased'
    self._model_max_len = 768

self.device, _ = get_devices()
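
# A minimal sketch (not part of the original snippet) of how the classes chosen
# above are typically instantiated with Hugging Face's from_pretrained API; the
# attribute names on the left are assumptions for illustration only.
self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
self._embeddings_model = self._embeddings_model_class.from_pretrained(self._pretrained_model_name)
self._embeddings_model.to(self.device)
self._embeddings_model.eval()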
def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path

    self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    self.model = DistilBertForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
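
# A standalone, hedged sketch (not from the snippet above) of what the loaded
# tokenizer/model pair is used for: predicting a single [MASK] token. Outputs
# are indexed as a tuple to stay agnostic to the transformers version.
import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizer

mlm_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
mlm_model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
mlm_model.eval()

input_ids = mlm_tokenizer.encode("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(input_ids)[0]  # prediction scores: (batch, seq_len, vocab_size)
mask_pos = (input_ids[0] == mlm_tokenizer.mask_token_id).nonzero()[0].item()
predicted_id = logits[0, mask_pos].argmax().item()
print(mlm_tokenizer.convert_ids_to_tokens([predicted_id]))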
# Functional version that works with dictionaries
return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]
def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
    """Extract tokens from a document, accounting for exceptions only if needed"""
    tokens = doc_to_fixed_tokens(doc)
    return tokens
return Aligner
english = "en_core_web_sm"
BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
}
# Create a text corpus suitable for language model training
def create_corpus(text_list, target_path, logger=None):
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "textcat"])

    with open(target_path, "w") as f:
        # Split sentences for each document
        if logger:
            logger.info("Formatting corpus for {}".format(target_path))
        for text in progress_bar(text_list):
            text = fix_html(text)
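            # A hedged sketch (the original snippet is truncated here) of how the
            # loop typically continues: split each cleaned document into sentences
            # with spaCy and write one sentence per line to the corpus file.
            doc = nlp(text)
            sentences = [sent.text.strip() for sent in doc.sents]
            f.write("\n".join(s for s in sentences if s) + "\n")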
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.model = DistilBertForQuestionAnswering.from_pretrained(self.model_dir)
    self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_dir)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
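
# A self-contained, hedged sketch (not from the snippet above) of extractive QA
# inference with the same model/tokenizer classes; the checkpoint name is only
# illustrative and outputs are indexed as a tuple to stay version-agnostic.
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

qa_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model.eval()

question = "Who wrote the report?"
context = "The report was written by Alice in 2019."
input_ids = qa_tokenizer.encode(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = qa_model(input_ids)
start = outputs[0].argmax().item()  # start logits
end = outputs[1].argmax().item()    # end logits
answer_tokens = qa_tokenizer.convert_ids_to_tokens(input_ids[0][start:end + 1])
print(qa_tokenizer.convert_tokens_to_string(answer_tokens))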
"""
Initializes a MultiLabelClassification model.
Args:
model_type: The type of model (bert, roberta)
model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_nodel.bin).
num_labels (optional): The number of labels or classes in the dataset.
args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
"""
MODEL_CLASSES = {
'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
self.tokenizer = tokenizer_class.from_pretrained(model_name)
self.model = model_class.from_pretrained(model_name, num_labels=num_labels)
self.num_labels = num_labels
if use_cuda:
if torch.cuda.is_available():
self.device = torch.device("cuda")
else:
raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.")
else:
self.device = "cpu"
self.results = {}
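
# A hedged usage sketch, assuming the __init__ above belongs to a class named
# MultiLabelClassificationModel (the class name is not shown in the snippet).
model = MultiLabelClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    num_labels=6,
    use_cuda=False,  # avoids the ValueError branch on machines without CUDA
)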