How to use the transformers.BertTokenizer.from_pretrained function in transformers

To help you get started, we’ve selected a few transformers examples that show popular ways transformers.BertTokenizer.from_pretrained is used in public projects.

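As a quick, self-contained sketch (not taken from any of the projects below), loading a pretrained tokenizer and encoding a sentence looks roughly like this; the checkpoint name and sample text are placeholders:

from transformers import BertTokenizer

# Download (or load from the local cache) the vocabulary of the checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split text into WordPiece tokens and map them to vocabulary ids.
tokens = tokenizer.tokenize("Hello, world!")
ids = tokenizer.encode("Hello, world!", add_special_tokens=True)
print(tokens)
print(ids)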

github csc-training / intro-to-dl / day2 / pytorch_20ng_bert.py (View on GitHub)
print(sentences_train[0], 'LABEL:', labels_train[0])

# Next we specify the pre-trained BERT model we are going to use. The
# model `"bert-base-uncased"` is the lowercased "base" model
# (12-layer, 768-hidden, 12-heads, 110M parameters).
#
# We load the used vocabulary from the BERT model, and use the BERT
# tokenizer to convert the sentences into tokens that match the data
# the BERT model was trained on.

print('Initializing BertTokenizer')

BERTMODEL='bert-base-uncased'
CACHE_DIR=os.path.join(DATADIR, 'transformers-cache')

tokenizer = BertTokenizer.from_pretrained(BERTMODEL, cache_dir=CACHE_DIR,
                                          do_lower_case=True)

tokenized_train = [tokenizer.tokenize(s) for s in sentences_train]
tokenized_test  = [tokenizer.tokenize(s) for s in sentences_test]

print("The full tokenized first training sentence:")
print(tokenized_train[0])

# Now we set the maximum sequence lengths for our training and test
# sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length
# supported by the used BERT model is 512.
#
# The token `[SEP]` is another special token required by BERT at the
# end of the sentence.

MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512
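
The truncation step the comments above describe is cut off in this excerpt; a sketch of how it is typically done, assuming the `[SEP]` token is appended after truncating each sentence to the chosen maximum length:

# Truncate to MAX_LEN - 1 tokens and append the [SEP] token BERT expects
# at the end of every sentence.
tokenized_train = [t[:MAX_LEN_TRAIN - 1] + ['[SEP]'] for t in tokenized_train]
tokenized_test  = [t[:MAX_LEN_TEST - 1] + ['[SEP]'] for t in tokenized_test]
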
github thunlp / OpenNRE / opennre / encoder / bert_encoder.py (View on GitHub)
def __init__(self, max_length, pretrain_path, blank_padding=True):
        """
        Args:
            max_length: max length of sentence
            pretrain_path: path of pretrain model
        """
        super().__init__()
        self.max_length = max_length
        self.blank_padding = blank_padding
        self.hidden_size = 768 * 2
        self.bert = BertModel.from_pretrained(pretrain_path)
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
        self.linear = nn.Linear(self.hidden_size, self.hidden_size)
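
The rest of the encoder is not shown here; as a rough sketch of how the tokenizer initialized above might be used to turn a sentence into fixed-length ids (the helper below is hypothetical, not OpenNRE's actual method):

import torch

def tokenize_sentence(self, text):
    # Hypothetical helper: add BERT's special tokens, convert to ids,
    # and pad/truncate to self.max_length so every sentence has the same shape.
    tokens = ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
    ids = self.tokenizer.convert_tokens_to_ids(tokens)
    if self.blank_padding:
        ids = ids[:self.max_length]
        ids = ids + [0] * (self.max_length - len(ids))  # 0 is BERT's [PAD] id
    return torch.tensor([ids])
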
github jetnew / DrFAQ / nlp / qa.py (View on GitHub)
def __init__(self, text_file):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        with open(text_file, 'r') as file:
            self.passage = file.read().replace('\n', ' ')
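
The answer-extraction step is not part of this excerpt; a sketch of how the tokenizer and model above are commonly combined to answer a question (the method name and details are assumptions, written against the transformers v4 output API):

import torch

def answer(self, question):
    # Encode the question and the passage as one sentence pair.
    inputs = self.tokenizer(question, self.passage, return_tensors='pt',
                            truncation=True, max_length=512)
    with torch.no_grad():
        outputs = self.model(**inputs)
    # Take the most likely start and end positions of the answer span.
    start = int(outputs.start_logits.argmax())
    end = int(outputs.end_logits.argmax())
    answer_ids = inputs['input_ids'][0][start:end + 1]
    return self.tokenizer.decode(answer_ids, skip_special_tokens=True)
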
github huggingface / transformers / examples / TPU / run_tpu_glue_fit.py (View on GitHub)
from tpu_utils import get_tpu
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer, glue_convert_examples_to_features
from time import time
import tensorflow_datasets
print("TF version: {}".format(tf.__version__))

num_epochs = 3
max_seq_length = 128

# The number of replicas should be obtained from the get_tpu() method, but the dataset pre-processing crashes if the
# TPU is loaded beforehand
num_replicas = 8

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
data = tensorflow_datasets.load('glue/mrpc')
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_seq_length, 'mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_seq_length, 'mrpc')

total_train_batch_size = 32
train_batch_size_per_replica = total_train_batch_size / num_replicas
train_dataset = train_dataset.batch(total_train_batch_size)
assert train_batch_size_per_replica.is_integer()

total_valid_batch_size = 64
valid_batch_size_per_replica = total_valid_batch_size / num_replicas
valid_dataset = valid_dataset.batch(total_valid_batch_size)
assert valid_batch_size_per_replica.is_integer()
print('Fetched & created dataset.')

tpu, num_replicas = get_tpu()
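
The training step is cut off here; assuming get_tpu() returns a tf.distribute strategy (an assumption about the local tpu_utils helper), the model would typically be built and trained inside its scope, roughly like this:

# Build and compile the model inside the TPU strategy's scope, then fit.
with tpu.scope():
    model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )
model.fit(train_dataset, epochs=num_epochs, validation_data=valid_dataset)
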
github weizhepei / BERT-NER / data_loader.py (View on GitHub)
def __init__(self, data_dir, bert_class, params, token_pad_idx=0, tag_pad_idx=-1):
        self.data_dir = data_dir
        self.batch_size = params.batch_size
        self.max_len = params.max_len
        self.device = params.device
        self.seed = params.seed
        self.token_pad_idx = token_pad_idx
        self.tag_pad_idx = tag_pad_idx

        tags = self.load_tags()
        self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(tags)}
        params.tag2idx = self.tag2idx
        params.idx2tag = self.idx2tag

        self.tokenizer = BertTokenizer.from_pretrained(bert_class, do_lower_case=False)
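
The rest of the loader is omitted; as a sketch of the sub-word alignment such a loader needs (a hypothetical helper, not the repository's code), each word is split into WordPiece tokens and only the first piece keeps the word's tag:

def tokenize_and_align(self, words, tags):
    # Hypothetical helper: BERT may split one word into several sub-word
    # pieces, so pad the tag sequence with tag_pad_idx for the extra pieces.
    tokens, tag_ids = [], []
    for word, tag in zip(words, tags):
        pieces = self.tokenizer.tokenize(word)
        tokens.extend(pieces)
        tag_ids.append(self.tag2idx[tag])
        tag_ids.extend([self.tag_pad_idx] * (len(pieces) - 1))
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    return token_ids[:self.max_len], tag_ids[:self.max_len]
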
github UKPLab / sentence-transformers / sentence_transformers / models / BERT.py (View on GitHub)
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(BERT, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if max_seq_length > 510:
            logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510")
            max_seq_length = 510
        self.max_seq_length = max_seq_length

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.bert = BertModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
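
The module's tokenize and forward methods are not shown; a simplified sketch of how text might be converted to ids while respecting the 510-token limit discussed above (not the library's actual implementation):

def tokenize(self, text: str):
    # Encode and truncate so that, with [CLS] and [SEP] added, the sequence
    # stays within BERT's 512-token limit.
    return self.tokenizer.encode(text, add_special_tokens=True,
                                 max_length=self.max_seq_length + 2,
                                 truncation=True)
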
github tacchinotacchi / distil-bilstm / generate_dataset.py (View on GitHub)
set_seed(42)
    
    # Load original tsv file
    input_tsv = load_tsv(args.input)
    if not args.no_augment:
        sentences = [spacy_en(text) for text, _ in tqdm(input_tsv, desc="Loading dataset")]
        # build lists of words indexes by POS tab
        pos_dict = build_pos_dict(sentences)
        # Generate augmented samples
        sentences = augmentation(sentences, pos_dict)
    else:
        sentences = [text for text, _ in input_tsv]

    # Load teacher model
    model = BertForSequenceClassification.from_pretrained(args.model).to(device)
    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=True)

    # Assign labels with teacher
    teacher_field = data.Field(sequential=True, tokenize=tokenizer.tokenize, lower=True, include_lengths=True, batch_first=True)
    fields = [("text", teacher_field)]
    if not args.no_augment:
        examples = [data.Example.fromlist([" ".join(words)], fields) for words in sentences]
    else:
        examples = [data.Example.fromlist([text], fields) for text in sentences]
    augmented_dataset = data.Dataset(examples, fields)
    teacher_field.vocab = BertVocab(tokenizer.vocab)
    new_labels = BertTrainer(model, device, batch_size=args.batch_size).infer(augmented_dataset)

    # Write to file
    with open(args.output, "w") as f:
        f.write("sentence\tscores\n")
        for sentence, rating in zip(sentences, new_labels):
github radiodee1 / awesome-chatbot / classifier / run_torch_bert_transformers.py (View on GitHub)
#!/usr/bin/env python3

from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

#prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
#next_sentence = "The sky is blue due to the shorter wavelength of blue light."

prompt = "that's great."
next_sentence = "that is great"

encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))

print(logits)
#assert logits[0, 0] < logits[0, 1] # next sentence was random
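
Note that the tuple unpacking and the next_sentence_label argument reflect an older transformers API; with transformers v4 the equivalent call would look roughly like this (the argument is named labels and the model returns an output object):

outputs = model(**encoding, labels=torch.LongTensor([1]))
loss, logits = outputs.loss, outputs.logits
print(logits)
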
github huggingface / transformers / examples / summarization / run_summarization.py (View on GitHub)
def evaluate(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
    model.to(args.device)
    model.eval()

    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
    }

    if args.compute_rouge:
        reference_summaries = []
        generated_summaries = []

        import rouge
        import nltk
github monologg / DistilKoBERT / distillation / scripts / binarized_data.py (View on GitHub)
def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
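
The excerpt ends before the encoding loop; a sketch of how the bos/sep tokens selected above are typically used to wrap and encode each line (an assumption, not the script's verbatim code):

with open(args.file_path, 'r', encoding='utf8') as fp:
    lines = fp.readlines()

# Wrap each line with the model-specific start/separator tokens and convert
# it to token ids; the special tokens are added manually here, so
# add_special_tokens is disabled.
token_ids = [
    tokenizer.encode(f'{bos} {line.strip()} {sep}', add_special_tokens=False)
    for line in lines
]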