def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.logger.info('Loading from checkpoint %s' % self.model_dir)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if self.device == torch.device("cpu"):
        self.logger.info("RUNNING ON CPU")
    else:
        self.logger.info("RUNNING ON CUDA")
        torch.cuda.synchronize(self.device)
    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(self.model_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
    self.rerank_model.to(self.device, non_blocking=True)
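
# Usage sketch (not from the original source): scoring query/passage pairs with the
# reranker loaded above. The `query`/`passages` inputs, the per-passage loop and the
# "last logit is the relevance score" convention are assumptions; the tokenizer
# __call__ / .logits API assumes a recent transformers version.
import torch

def rerank_scores(model, tokenizer, device, query, passages, max_length=512):
    """Return one relevance score per passage for a single query."""
    model.eval()
    scores = []
    with torch.no_grad():
        for passage in passages:
            inputs = tokenizer(query, passage, truncation=True,
                               max_length=max_length, return_tensors="pt").to(device)
            logits = model(**inputs).logits
            # assume the last logit is the relevance score (works for 1- or 2-label heads)
            scores.append(logits[0, -1].item())
    return scores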
"""
self.texts = texts
self.labels = labels
self.label_dict = label_dict
self.max_seq_length = max_seq_length
if self.label_dict is None and labels is not None:
# {'class1': 0, 'class2': 1, 'class3': 2, ...}
# using this instead of `sklearn.preprocessing.LabelEncoder`
# no easily handle unknown target values
self.label_dict = dict(zip(sorted(set(labels)),
range(len(set(labels)))))
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# suppresses tokenizer warnings
logging.getLogger(
"transformers.tokenization_utils").setLevel(logging.FATAL)
# special tokens for transformers
# in the simplest case a [CLS] token is added in the beginning
# and [SEP] token is added in the end of a piece of text
# [CLS] [SEP] .. <[PAD]>
self.sep_vid = self.tokenizer.vocab["[SEP]"]
self.cls_vid = self.tokenizer.vocab["[CLS]"]
self.pad_vid = self.tokenizer.vocab["[PAD]"]
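
# Illustration only (not the dataset's actual __getitem__): how a single text could be
# turned into fixed-length id/mask tensors using the pad id kept above. The function
# name, truncation rule and the transformers >= 3 encode() signature are assumptions.
import torch

def encode_text(tokenizer, text, max_seq_length, pad_vid):
    # encode() inserts [CLS]/[SEP] itself when add_special_tokens=True
    ids = tokenizer.encode(text, add_special_tokens=True,
                           max_length=max_seq_length, truncation=True)
    mask = [1] * len(ids)
    # right-pad to the fixed sequence length
    pad_len = max_seq_length - len(ids)
    ids = ids + [pad_vid] * pad_len
    mask = mask + [0] * pad_len
    return torch.tensor(ids), torch.tensor(mask)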
from transformers import (AutoConfig,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          AdamW,
                          ConstantLRSchedule)

super().__init__(*args, **kwargs)
model_config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.data_dir)
model_config.num_labels = 1  # set up for regression
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if self.device == torch.device("cpu"):
    self.logger.info("RUNNING ON CPU")
self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
    self.model_name,
    config=model_config,
    cache_dir=self.data_dir)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.data_dir)
self.rerank_model.to(self.device)
self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
self.scheduler = ConstantLRSchedule(self.optimizer)
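
# Minimal training-step sketch (an assumption, not the trainer's actual loop): MSE
# between the single regression logit (num_labels == 1 above) and a float relevance
# label, using the optimizer/scheduler created above; the `batch` fields are illustrative.
import torch
import torch.nn.functional as F

def train_step(model, optimizer, scheduler, batch, device):
    model.train()
    optimizer.zero_grad()
    outputs = model(input_ids=batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device))
    # num_labels == 1 -> logits of shape (batch, 1); squeeze to (batch,)
    preds = outputs.logits.squeeze(-1)  # in older transformers versions: outputs[0]
    loss = F.mse_loss(preds, batch["labels"].float().to(device))
    loss.backward()
    optimizer.step()
    scheduler.step()
    return loss.item()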
)
args = parser.parse_args()
if args.model_type != "bert":
    raise ValueError(
        "Only the BERT architecture is currently supported for seq2seq."
    )
# Set up training device
# device = torch.device("cpu")
# Set seed
set_seed(args)
# Load pretrained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = Model2Model.from_pretrained(args.model_name_or_path)
# model.to(device)
logger.info("Training/evaluation parameters %s", args)
# Training
train_dataset = load_and_cache_examples(args, tokenizer)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
# logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def tokenizer(*args, **kwargs):
    r"""
    # Using torch.hub !
    import torch
    tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
    tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
    return AutoTokenizer.from_pretrained(*args, **kwargs)
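
# Usage sketch for the hub-style wrapper above; the model name is just an example.
tok = tokenizer('bert-base-uncased')  # same as AutoTokenizer.from_pretrained('bert-base-uncased')
ids = tok.encode("Hello, world!", add_special_tokens=True)
print(tok.convert_ids_to_tokens(ids))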
    tokenizer = {'SP': SentencePieceBPETokenizer,
                 'BBPE': ByteLevelBPETokenizer,
                 'CharBPE': CharBPETokenizer,
                 'BERT': BertWordPieceTokenizer}[tokenizer_type]
    if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
        merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                              merges_file=os.path.join(tokenizer_path, merges_file))
    else:
        vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
        tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
    is_transformers_tokenizer = False
else:
    is_transformers_tokenizer = True
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
return tokenizer, is_transformers_tokenizer
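
# Downstream usage sketch (not from the original source): the two tokenizer families
# return different encode() results, so callers can branch on the flag returned above.
# `load_tokenizer` is a hypothetical name for the function this excerpt belongs to.
tokenizer, is_transformers_tokenizer = load_tokenizer(tokenizer_path, tokenizer_type)

if is_transformers_tokenizer:
    ids = tokenizer.encode("some text", add_special_tokens=True)  # transformers: list of ids
else:
    ids = tokenizer.encode("some text").ids  # tokenizers library: Encoding object, ids attribute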
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
        max_input_size = tokenizer.max_model_input_sizes[model_name]

        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()

            for slice_size in slice_sizes:
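
# The loop body above is cut off in this excerpt. A sketch of what timing one
# (batch_size, slice_size) pair could look like; the timeit-based averaging and
# the result bookkeeping are assumptions, not the original benchmark code.
import timeit
import torch

sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
with torch.no_grad():
    runtime = timeit.timeit(lambda: model(sequence), number=average_over)
dictionary[model_name]["results"][batch_size][slice_size] = runtime / average_over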
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer

    # download the model or load the model path
    weights_path = download_model('bert.ner', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)

    self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
                       "I-ORG", "B-LOC", "I-LOC"]

    self.model = AutoModelForTokenClassification.from_pretrained(weights_path)
    self.tokenizer = AutoTokenizer.from_pretrained(weights_path)
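
# Prediction sketch (not the library's actual predict method): tagging one sentence
# with the model/tokenizer/label_list set up above. The function name is illustrative
# and the tokenizer __call__ / .logits API assumes a recent transformers version.
import torch

def tag(model, tokenizer, label_list, sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return [(tok, label_list[i]) for tok, i in zip(tokens, pred_ids)]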
def __init__(self, model_name, device=None, max_len=None):
    super().__init__()
    self.model_name = model_name
    self.device = device or _device
    self.log = get_logger()
    do_lower_case = "uncased" in model_name
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_name, do_lower_case=do_lower_case)
    # self.begin_mention_idx = self.tokenizer.convert_tokens_to_ids(
    #     self.BEGIN_MENTION)

    if self.model_name.startswith('roberta'):
        self.BEGIN_MENTION = 'madeupword0000'
        self.END_MENTION = 'madeupword0001'
        self.add_special_symbols = self.add_special_symbols_roberta
    else:
        self.BEGIN_MENTION = '[unused0]'
        self.END_MENTION = '[unused1]'
        self.add_special_symbols = self.add_special_symbols_bert

    self.BEGIN_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
        self.BEGIN_MENTION)
    self.begin_mention_idx = self.BEGIN_MENTION_IDX
    self.END_MENTION_IDX = self.tokenizer.convert_tokens_to_ids(
        self.END_MENTION)
def _compute_tensorflow(model_names, dictionary, average_over, amp):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
        max_input_size = tokenizer.max_model_input_sizes[model_name]

        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        print("Using model", model)

        @tf.function
        def inference(inputs):
            return model(inputs)
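
# Sketch of timing the compiled inference above (the benchmark's inner loops are not
# shown in this excerpt); the tensor construction and timeit averaging are assumptions.
import timeit
import tensorflow as tf

for batch_size in batch_sizes:
    for slice_size in slice_sizes:
        sequence = tf.stack([tf.constant(tokenized_sequence[:slice_size])] * batch_size)
        inference(sequence)  # first call traces the tf.function
        runtime = timeit.timeit(lambda: inference(sequence), number=average_over)
        dictionary[model_name]["results"][batch_size][slice_size] = runtime / average_over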