print(sentences_train[0], 'LABEL:', labels_train[0])
# Next we specify the pre-trained BERT model we are going to use. The
# model `"bert-base-uncased"` is the lowercased "base" model
# (12-layer, 768-hidden, 12-heads, 110M parameters).
#
# We load the used vocabulary from the BERT model, and use the BERT
# tokenizer to convert the sentences into tokens that match the data
# the BERT model was trained on.
print('Initializing BertTokenizer')
BERTMODEL='bert-base-uncased'
CACHE_DIR=os.path.join(DATADIR, 'transformers-cache')
tokenizer = BertTokenizer.from_pretrained(BERTMODEL, cache_dir=CACHE_DIR,
                                          do_lower_case=True)
tokenized_train = [tokenizer.tokenize(s) for s in sentences_train]
tokenized_test = [tokenizer.tokenize(s) for s in sentences_test]
print ("The full tokenized first training sentence:")
print (tokenized_train[0])
# Now we set the maximum sequence lengths for our training and test
# sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length
# supported by this BERT model is 512.
#
# BERT also requires two special tokens: `[CLS]` at the start of each
# sentence and `[SEP]` at its end.
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512
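# A minimal sketch of how the token lists above would typically be
# prepared for the model (hedged: the notebook's exact preprocessing may
# differ): truncate to leave room for the two special tokens, wrap each
# sentence in `[CLS]` ... `[SEP]`, and map tokens to vocabulary ids.
ids_train = [tokenizer.convert_tokens_to_ids(['[CLS]'] + t[:MAX_LEN_TRAIN - 2] + ['[SEP]'])
             for t in tokenized_train]
print("The first training sentence as vocabulary ids:")
print(ids_train[0])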
def __init__(self, max_length, pretrain_path, blank_padding=True):
    """
    Args:
        max_length: max length of sentence
        pretrain_path: path of the pretrained model
        blank_padding: whether to pad sentences up to max_length
    """
    super().__init__()
    self.max_length = max_length
    self.blank_padding = blank_padding
    # The output dimension is twice BERT-base's hidden size of 768
    self.hidden_size = 768 * 2
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
    self.linear = nn.Linear(self.hidden_size, self.hidden_size)
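# A hedged sketch of a matching forward pass (not from the original
# source): the doubled hidden size suggests concatenating two 768-d
# vectors, e.g. the hidden states at two marked positions. `pos1` and
# `pos2` are hypothetical (batch,)-shaped index tensors, and the tuple
# return from self.bert assumes a pre-4.0 transformers release.
def forward(self, token_ids, att_mask, pos1, pos2):
    hidden, _ = self.bert(token_ids, attention_mask=att_mask)  # (B, L, 768)
    batch_idx = torch.arange(hidden.size(0), device=hidden.device)
    head = hidden[batch_idx, pos1]                             # (B, 768)
    tail = hidden[batch_idx, pos2]                             # (B, 768)
    return self.linear(torch.cat([head, tail], dim=-1))       # (B, 1536)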
def __init__(self, text_file):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    # Read the passage once and collapse newlines into spaces
    with open(text_file, 'r') as file:
        self.passage = file.read().replace('\n', ' ')
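# A hedged sketch of an answer-extraction method for the class above;
# the method name `answer` is ours, `torch` is assumed to be imported,
# and the attribute access on the output assumes a transformers version
# that returns output objects (older releases return a logits tuple).
def answer(self, question):
    inputs = self.tokenizer(question, self.passage, return_tensors='pt',
                            truncation=True, max_length=512)
    with torch.no_grad():
        outputs = self.model(**inputs)
    start = torch.argmax(outputs.start_logits)      # most likely answer start
    end = torch.argmax(outputs.end_logits) + 1      # answer end (exclusive)
    tokens = inputs['input_ids'][0][start:end]
    return self.tokenizer.decode(tokens, skip_special_tokens=True)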
from tpu_utils import get_tpu
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer, glue_convert_examples_to_features
from time import time
import tensorflow_datasets
print("TF version: {}".format(tf.__version__))
num_epochs = 3
max_seq_length = 128
# The number of replicas should be obtained from get_tpu(), but the
# dataset pre-processing crashes if the TPU is initialized first, so the
# value is hard-coded here and fetched again below.
num_replicas = 8
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
data = tensorflow_datasets.load('glue/mrpc')
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_seq_length, 'mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_seq_length, 'mrpc')
total_train_batch_size = 32
train_batch_size_per_replica = total_train_batch_size / num_replicas
train_dataset = train_dataset.batch(total_train_batch_size)
assert train_batch_size_per_replica.is_integer()
total_valid_batch_size = 64
valid_batch_size_per_replica = total_valid_batch_size / num_replicas
valid_dataset = valid_dataset.batch(total_valid_batch_size)
assert valid_batch_size_per_replica.is_integer()
print('Fetched & created dataset.')
tpu, num_replicas = get_tpu()
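# A plausible continuation (our sketch, not the original script): build
# and compile the model inside the distribution strategy returned by
# get_tpu(), assuming `tpu` here is a tf.distribute strategy object.
with tpu.scope():
    model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        # HF TF models output raw logits, hence from_logits=True
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
start = time()
model.fit(train_dataset, epochs=num_epochs, validation_data=valid_dataset)
print("Training took {:.1f}s".format(time() - start))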
def __init__(self, data_dir, bert_class, params, token_pad_idx=0, tag_pad_idx=-1):
    self.data_dir = data_dir
    self.batch_size = params.batch_size
    self.max_len = params.max_len
    self.device = params.device
    self.seed = params.seed
    self.token_pad_idx = token_pad_idx
    self.tag_pad_idx = tag_pad_idx

    # Build tag <-> index mappings and expose them on params
    tags = self.load_tags()
    self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(tags)}
    params.tag2idx = self.tag2idx
    params.idx2tag = self.idx2tag

    self.tokenizer = BertTokenizer.from_pretrained(bert_class, do_lower_case=False)
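# `load_tags` is called above but not shown; a minimal sketch under the
# assumption that the tag set lives one per line in a `tags.txt` file
# inside `data_dir` (the file name is our guess), with `os` imported:
def load_tags(self):
    tags_path = os.path.join(self.data_dir, 'tags.txt')
    with open(tags_path, 'r') as f:
        return [line.strip() for line in f if line.strip()]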
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
    super(BERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 510:
        logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510")
        max_seq_length = 510
    self.max_seq_length = max_seq_length

    # Only override the tokenizer's default when do_lower_case was set explicitly
    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.bert = BertModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
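# Example usage of the module above (our sketch; the surrounding
# sentence-transformers plumbing is omitted):
bert = BERT('bert-base-uncased', max_seq_length=256)
print(bert.max_seq_length)  # 256
print(bert.do_lower_case)   # None, so the tokenizer's own default is kept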
set_seed(42)
# Load original tsv file
input_tsv = load_tsv(args.input)
if not args.no_augment:
    sentences = [spacy_en(text) for text, _ in tqdm(input_tsv, desc="Loading dataset")]
    # Build lists of word indexes grouped by POS tag
    pos_dict = build_pos_dict(sentences)
    # Generate augmented samples
    sentences = augmentation(sentences, pos_dict)
else:
    sentences = [text for text, _ in input_tsv]
# Load teacher model
model = BertForSequenceClassification.from_pretrained(args.model).to(device)
tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=True)
# Assign labels with teacher
teacher_field = data.Field(sequential=True, tokenize=tokenizer.tokenize, lower=True, include_lengths=True, batch_first=True)
fields = [("text", teacher_field)]
if not args.no_augment:
    examples = [data.Example.fromlist([" ".join(words)], fields) for words in sentences]
else:
    examples = [data.Example.fromlist([text], fields) for text in sentences]
augmented_dataset = data.Dataset(examples, fields)
teacher_field.vocab = BertVocab(tokenizer.vocab)
new_labels = BertTrainer(model, device, batch_size=args.batch_size).infer(augmented_dataset)
# Write to file
with open(args.output, "w") as f:
    f.write("sentence\tscores\n")
    for sentence, rating in zip(sentences, new_labels):
        # Plausible completion of the truncated loop: one tab-separated
        # row per sample (the original snippet breaks off here)
        f.write(f"{sentence}\t{rating}\n")
#!/usr/bin/env python3
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
#prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
#next_sentence = "The sky is blue due to the shorter wavelength of blue light."
prompt = "that's great."
next_sentence = "that is great"
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
# Tuple unpacking and the `next_sentence_label` keyword assume a pre-4.0
# transformers release; newer versions return an output object and use `labels`.
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
print(logits)
#assert logits[0, 0] < logits[0, 1] # next sentence was random
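# A minimal sketch of interpreting the NSP logits above: index 0 scores
# "B follows A", index 1 scores "B is a random sentence".
probs = torch.softmax(logits, dim=1)
print("P(is next): {:.3f}, P(random): {:.3f}".format(probs[0, 0].item(), probs[0, 1].item()))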
def evaluate(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    # BertAbs repurposes unused vocabulary entries as decoding control tokens
    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk
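        # After the generation loop (omitted in this truncated snippet), the
        # filled lists could be scored with the `rouge` package imported
        # above; the call below is our sketch, not the original code:
        scores = rouge.Rouge().get_scores(generated_summaries, reference_summaries, avg=True)
        print(scores["rouge-l"])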
def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
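    # A plausible sketch of the remaining steps the description promises
    # (tokenization + token_to_ids); the exact dump logic is our guess:
    with open(args.file_path, 'r', encoding='utf8') as fp:
        lines = fp.readlines()
    logger.info('Start encoding')
    token_ids = [tokenizer.encode(f'{bos} {line.strip()} {sep}', add_special_tokens=False)
                 for line in lines]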