How to use the transformers.BertTokenizer class in transformers

To help you get started, we’ve selected a few transformers.BertTokenizer examples, based on popular ways it is used in public projects.

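Before the project excerpts below, here is a minimal, self-contained sketch of the typical BertTokenizer workflow; the "bert-base-uncased" checkpoint is just one common choice.

from transformers import BertTokenizer

# Download (or load from cache) the WordPiece vocabulary of a pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Split raw text into WordPiece tokens and map them to vocabulary ids.
tokens = tokenizer.tokenize("Hello, how are you?")
input_ids = tokenizer.encode("Hello, how are you?")  # adds [CLS] and [SEP] by default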

github huggingface / transformers / examples / run_lm_finetuning.py View on Github
# Imports needed by this excerpt
import logging
import os

from torch.utils.data import Dataset

from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)


MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}


class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
github sz128 / slot_filling_and_intent_detection_of_SLU / scripts / slot_tagging_and_intent_detection_with_elmo_and_transformer.py View on Github
from allennlp.modules.elmo import Elmo, batch_to_ids

import models.slot_tagger as slot_tagger
import models.slot_tagger_with_focus as slot_tagger_with_focus
import models.slot_tagger_crf as slot_tagger_with_crf
import models.snt_classifier as snt_classifier

import utils.vocab_reader as vocab_reader
import utils.data_reader_for_elmo as data_reader
import utils.read_wordEmb as read_wordEmb
import utils.util as util
import utils.acc as acc

MODEL_CLASSES = {
        'bert': (BertModel, BertTokenizer),
        'xlnet': (XLNetModel, XLNetTokenizer),
        }

parser = argparse.ArgumentParser()
parser.add_argument('--task_st', required=True, help='slot filling task: slot_tagger | slot_tagger_with_focus | slot_tagger_with_crf')
parser.add_argument('--task_sc', required=True, help='intent detection task: none | 2tails | maxPooling | hiddenCNN | hiddenAttention')
parser.add_argument('--sc_type', default='single_cls_CE', help='single_cls_CE | multi_cls_BCE')
parser.add_argument('--st_weight', type=float, default=0.5, help='loss weight for slot tagging task, ranging from 0 to 1.')

parser.add_argument('--dataset', required=True, help='atis-2 | snips')
parser.add_argument('--dataroot', required=True, help='path to dataset')
parser.add_argument('--save_model', default='model', help='save model to this file')
#parser.add_argument('--mini_word_freq', type=int, default=2, help='mini_word_freq in the training data')
#parser.add_argument('--word_lowercase', action='store_true', help='word lowercase')
parser.add_argument('--bos_eos', action='store_true', help='Whether to add <s> and </s> to the input sentence (default is not)')
parser.add_argument('--save_vocab', default='vocab', help='save vocab to this file')
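Each entry in MODEL_CLASSES above pairs a pretrained encoder with its matching tokenizer; a minimal sketch of how such a pair is typically loaded, assuming the 'bert-base-uncased' weights:

pretrained_model_class, tokenizer_class = MODEL_CLASSES['bert']
tokenizer = tokenizer_class.from_pretrained('bert-base-uncased')       # BertTokenizer
encoder = pretrained_model_class.from_pretrained('bert-base-uncased')  # BertModel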
github botfront / rasa-for-botfront / rasa / nlu / utils / hugging_face / registry.py View on Github
    gpt2_tokens_cleaner,
    xlnet_tokens_cleaner,
)


model_class_dict = {
    "bert": TFBertModel,
    "gpt": TFOpenAIGPTModel,
    "gpt2": TFGPT2Model,
    "xlnet": TFXLNetModel,
    # "xlm": TFXLMModel, # Currently doesn't work because of a bug in transformers library https://github.com/huggingface/transformers/issues/2729
    "distilbert": TFDistilBertModel,
    "roberta": TFRobertaModel,
}
model_tokenizer_dict = {
    "bert": BertTokenizer,
    "gpt": OpenAIGPTTokenizer,
    "gpt2": GPT2Tokenizer,
    "xlnet": XLNetTokenizer,
    # "xlm": XLMTokenizer,
    "distilbert": DistilBertTokenizer,
    "roberta": RobertaTokenizer,
}
model_weights_defaults = {
    "bert": "bert-base-uncased",
    "gpt": "openai-gpt",
    "gpt2": "gpt2",
    "xlnet": "xlnet-base-cased",
    # "xlm": "xlm-mlm-enfr-1024",
    "distilbert": "distilbert-base-uncased",
    "roberta": "roberta-base",
}
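The three dictionaries are keyed by the same short model names, so a component can resolve the model class, the tokenizer class and the default weights in one lookup. A minimal, illustrative sketch of that pattern (not Rasa's actual loading code):

model_name = "bert"
weights = model_weights_defaults[model_name]

tokenizer = model_tokenizer_dict[model_name].from_pretrained(weights)  # BertTokenizer
model = model_class_dict[model_name].from_pretrained(weights)          # TFBertModel, requires TensorFlow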
github huggingface / transformers / examples / run_xnli.py View on Github
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)

ALL_MODELS = sum(
    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
}


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
github zysite / biaffine-parser / parser / cmds / cmd.py View on Github
    def __call__(self, args):
        self.args = args
        if not os.path.exists(args.file):
            os.mkdir(args.file)
        if not os.path.exists(args.fields) or args.preprocess:
            print("Preprocess the data")
            self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
            if args.feat == 'char':
                self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                      fix_len=args.fix_len, tokenize=list)
            elif args.feat == 'bert':
                tokenizer = BertTokenizer.from_pretrained(args.bert_model)
                self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                      tokenize=tokenizer.encode)
            else:
                self.FEAT = Field('tags', bos=bos)
            self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
            self.REL = Field('rels', bos=bos)
            if args.feat in ('char', 'bert'):
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.HEAD, DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                    HEAD=self.HEAD, DEPREL=self.REL)

            train = Corpus.load(args.ftrain, self.fields)
            if args.fembed:
                embed = Embedding.load(args.fembed, args.unk)
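The BertField above passes tokenizer.encode as its tokenize function, so each word is mapped straight to WordPiece ids. A minimal sketch of what a single call returns (the checkpoint name is illustrative):

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# One word can expand to several WordPiece ids; encode() also adds the
# [CLS] and [SEP] special tokens by default.
print(tokenizer.encode('unaffable'))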
github sassoftware / python-dlpy / dlpy / transformers / bert_model.py View on Github
            if num_hidden_layers > tmp_config['num_hidden_layers']:
                raise DLPyError('You specified more hidden layers than are available in '
                                'the base BERT model.')

            self._base_model = BertModel.from_pretrained(name,
                                                         cache_dir=cache_dir,
                                                         num_hidden_layers=num_hidden_layers)
            self._config = BertConfig.from_pretrained(name,
                                                      cache_dir=cache_dir,
                                                      num_hidden_layers=num_hidden_layers).to_dict()

        if self._verbose:
            print("NOTE: base BERT model loaded.")

        # instantiate BERT _tokenizer
        self._tokenizer = BertTokenizer.from_pretrained(name)
        
        # load embedding table (token | position | segment)
        self._load_embedding_table()
            
        if self._config['max_position_embeddings'] < max_seq_len:
            raise DLPyError('The specified maximum sequence length exceeds the maximum position embedding.')
        else:
            self._max_seq_len = max_seq_len
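The guard above simply compares the requested sequence length against the size of BERT's position-embedding table (512 positions for the base models). A minimal sketch of the same check outside of DLPy, assuming the 'bert-base-uncased' checkpoint:

config = BertConfig.from_pretrained('bert-base-uncased').to_dict()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_seq_len = 128
assert config['max_position_embeddings'] >= max_seq_len  # 512 for bert-base-uncased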
github flypythoncom / cs224n_2019 / pytorch-bert-code / bert.py View on Github
        self.train_path = dataset + '/data/train.txt'                                # training set
        self.dev_path = dataset + '/data/dev.txt'                                    # validation set
        self.test_path = dataset + '/data/test.txt'                                  # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]                                # list of class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'        # where the trained model is saved
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # device

        self.require_improvement = 1000                                 # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                         # number of classes
        self.num_epochs = 3                                             # number of epochs
        self.batch_size = 128                                           # mini-batch size
        self.pad_size = 32                                              # sequence length per sentence (pad short, truncate long)
        self.learning_rate = 5e-5                                       # learning rate
        self.bert_path = './bert'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768
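Downstream, this configuration tokenizes each sentence and pads or truncates the resulting ids to pad_size. A minimal sketch of that step with hand-rolled padding; the local './bert' path is assumed to contain the vocabulary file:

tokenizer = BertTokenizer.from_pretrained('./bert')

tokens = ['[CLS]'] + tokenizer.tokenize('an example sentence')
token_ids = tokenizer.convert_tokens_to_ids(tokens)

pad_size = 32
token_ids = token_ids[:pad_size] + [0] * max(0, pad_size - len(token_ids))  # pad short, truncate long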
github huggingface / transformers / examples / summarization / convert_bertabs_original_pytorch_checkpoint.py View on Github
    # -------------------
    # Convert the weights
    # -------------------

    logging.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # ----------------------------------
    # Make sure the outputs are identical
    # ----------------------------------

    logging.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the
    # loaded weights.
    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
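The manual padding above works because BertTokenizer exposes the ids of its special tokens directly. A quick illustration, using the values from the standard bert-base-uncased vocabulary:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.pad_token, tokenizer.pad_token_id)     # '[PAD]' 0
print(tokenizer.cls_token_id, tokenizer.sep_token_id)  # 101 102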
github IlyaGusev / summarus / external / presumm_scripts / convert_to_presumm.py View on Github
    def __init__(self, bert_model, lower, max_src_tokens, max_tgt_tokens):
        self.max_src_tokens = max_src_tokens
        self.max_tgt_tokens = max_tgt_tokens
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=lower)
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused1] '
        self.tgt_eos = ' [unused2]'
        self.tgt_sent_split = ' [unused3] '
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
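Indexing tokenizer.vocab works because BertTokenizer exposes its WordPiece vocabulary as a token-to-id mapping; the same ids can also be obtained through convert_tokens_to_ids, as in this sketch:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

sep_vid = tokenizer.convert_tokens_to_ids('[SEP]')  # same as tokenizer.vocab['[SEP]']
cls_vid = tokenizer.convert_tokens_to_ids('[CLS]')
pad_vid = tokenizer.convert_tokens_to_ids('[PAD]')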