# vocab.txt
import os
import torch
from transformers import BertConfig, BertModel, BertTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bert_path = './bert'
do_lower_case = True
bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')
# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)
# Load the vocabulary
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Load the model
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)
# Tokenize input
text = "乌兹别克斯坦议会立法院主席获连任"  # "Speaker of Uzbekistan's Legislative Chamber re-elected"
tokenized_text = tokenizer.tokenize(text)
tokenized_text=['[CLS]'] + tokenized_text + ['[SEP]']
# Convert token to vocabulary indices
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length] containing the token indices of the words in the vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
# segment_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with token type indices selected in [0, 1]; type 0 corresponds to sentence A and type 1 to sentence B
segment_ids = [0]*len(input_ids)
# input_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]
input_mask = [1]*len(input_ids)
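# The lists above still have to be batched into tensors before they can be fed to the model.
# A minimal inference sketch, assuming the variables prepared above:
input_ids_tensor = torch.tensor([input_ids], device=device)      # [1, seq_len]
segment_ids_tensor = torch.tensor([segment_ids], device=device)
input_mask_tensor = torch.tensor([input_mask], device=device)
model_bert.eval()
with torch.no_grad():
    outputs = model_bert(input_ids=input_ids_tensor,
                         attention_mask=input_mask_tensor,
                         token_type_ids=segment_ids_tensor)
sequence_output = outputs[0]   # hidden states, [1, seq_len, hidden_size]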
def __init__(self, config):
    super(MyModel, self).__init__()
    self.config = config
    self.bert = BertModel.from_pretrained(config.pretrained_model_path)
    self.tag_linear = nn.Linear(self.bert.config.hidden_size, 5)
    self.dropout = nn.Dropout(config.dropout_prob)
    self.loss_func = nn.CrossEntropyLoss()
    self.theta = config.theta
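# The snippet above only shows the constructor. A hypothetical forward pass (a sketch,
# not the repository's actual method) would encode the batch, apply dropout to the
# pooled [CLS] output, and project it through tag_linear:
def forward(self, input_ids, attention_mask=None, labels=None):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    pooled = self.dropout(outputs[1])        # [batch_size, hidden_size]
    logits = self.tag_linear(pooled)         # [batch_size, 5]
    if labels is not None:
        return logits, self.loss_func(logits, labels)
    return logits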
def __init__(self, bert_config, args):
    super(BertClassifier, self).__init__(bert_config)
    self.bert = BertModel.from_pretrained(args.model_name_or_path, config=bert_config)  # Load pretrained bert
    self.num_labels = bert_config.num_labels
    self.label_classifier = FCLayer(bert_config.hidden_size, bert_config.num_labels, args.dropout_rate, use_activation=False)
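# Construction sketch for BertClassifier (argument names follow the constructor above;
# the argparse-style `args` object here is illustrative, not from the original code):
from argparse import Namespace
from transformers import BertConfig

args = Namespace(model_name_or_path='bert-base-uncased', dropout_rate=0.1)
bert_config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
model = BertClassifier(bert_config, args)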
def __init__(self, bert_config, model_config, device, slot_dim, intent_dim, intent_weight=None):
    super(JointBERT, self).__init__(bert_config)
    self.slot_num_labels = slot_dim
    self.intent_num_labels = intent_dim
    self.device = device
    self.intent_weight = intent_weight if intent_weight is not None else torch.tensor([1.] * intent_dim)
    self.bert = BertModel(bert_config)
    self.dropout = nn.Dropout(model_config['dropout'])
    self.context = model_config['context']
    self.finetune = model_config['finetune']
    self.context_grad = model_config['context_grad']
    if self.context:
        self.intent_classifier = nn.Linear(2 * bert_config.hidden_size, self.intent_num_labels)
        self.slot_classifier = nn.Linear(2 * bert_config.hidden_size, self.slot_num_labels)
        self.intent_hidden = nn.Linear(2 * bert_config.hidden_size, 2 * bert_config.hidden_size)
        self.slot_hidden = nn.Linear(2 * bert_config.hidden_size, 2 * bert_config.hidden_size)
    else:
        self.intent_classifier = nn.Linear(bert_config.hidden_size, self.intent_num_labels)
        self.slot_classifier = nn.Linear(bert_config.hidden_size, self.slot_num_labels)
        self.intent_hidden = nn.Linear(bert_config.hidden_size, bert_config.hidden_size)
        self.slot_hidden = nn.Linear(bert_config.hidden_size, bert_config.hidden_size)
    self.intent_loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=self.intent_weight)
    self.slot_loss_fct = torch.nn.CrossEntropyLoss()
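# A hypothetical forward for the non-context branch of JointBERT (a sketch; the real
# method and the context handling are not shown in the snippet above):
def forward(self, word_seq_tensor, word_mask_tensor):
    outputs = self.bert(input_ids=word_seq_tensor, attention_mask=word_mask_tensor)
    sequence_output, pooled_output = outputs[0], outputs[1]
    sequence_output = torch.relu(self.slot_hidden(self.dropout(sequence_output)))
    pooled_output = torch.relu(self.intent_hidden(self.dropout(pooled_output)))
    slot_logits = self.slot_classifier(self.dropout(sequence_output))    # per-token slot logits
    intent_logits = self.intent_classifier(self.dropout(pooled_output))  # per-utterance intent logits
    return slot_logits, intent_logits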
def __init__(self, config):
    super().__init__(config)
    self.bert = BertModel(config)
    # self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # The classification layer that takes the [CLS] representation and outputs the logit
    self.cls_layer = nn.Linear(config.hidden_size, 1)
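# A hypothetical forward for this single-logit classifier (a sketch): take the final
# hidden state of the [CLS] token and map it to one logit, typically trained with
# nn.BCEWithLogitsLoss for binary classification.
def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    cls_rep = outputs[0][:, 0]        # [CLS] hidden state, [batch_size, hidden_size]
    return self.cls_layer(cls_rep)    # [batch_size, 1]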
try:
    from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer
    from transformers import AlbertModel, CamembertModel, BertModel
except ImportError:
    msg = "importing bert dep failed."
    msg += "\n try to install sister by `pip install sister[bert]`."
    raise ImportError(msg)

if lang == "en":
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    model = AlbertModel.from_pretrained("albert-base-v2")
elif lang == "fr":
    tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    model = CamembertModel.from_pretrained("camembert-base")
elif lang == "ja":
    tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
    model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
self.tokenizer = tokenizer
self.model = model
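# Hedged usage sketch: mean-pool token embeddings from whichever tokenizer/model pair
# was selected above (the embed() helper is illustrative, not part of the sister API):
import torch

def embed(sentence, tokenizer, model):
    input_ids = tokenizer.encode(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs[0].mean(dim=1).squeeze(0)   # [hidden_size] sentence vector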
def __init__(self, pretrain_path, max_length, cat_entity_rep=False):
    nn.Module.__init__(self)
    self.bert = BertModel.from_pretrained(pretrain_path)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.cat_entity_rep = cat_entity_rep
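# A hypothetical forward matching the flag above (a sketch, not the original code):
# return the pooled [CLS] vector, or, when cat_entity_rep is True, the concatenation
# of the hidden states at the two entity-start positions pos1 and pos2.
def forward(self, token, att_mask, pos1=None, pos2=None):
    outputs = self.bert(token, attention_mask=att_mask)
    hidden, pooled = outputs[0], outputs[1]
    if not self.cat_entity_rep:
        return pooled
    batch_idx = torch.arange(hidden.size(0), device=hidden.device)
    head_rep = hidden[batch_idx, pos1]          # [batch_size, hidden_size]
    tail_rep = hidden[batch_idx, pos2]
    return torch.cat([head_rep, tail_rep], dim=-1)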
def __init__(self, config):
    super(BertForQAEmbed, self).__init__(config)
    self.bert = transformers.BertModel(config)
    self.dropout = nn.Dropout(0.1)
    self.q_fnn_layer = FnnLayer(config.hidden_size)
    self.a_fnn_layer = FnnLayer(config.hidden_size)
    self.init_weights()
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel, RobertaModel, AlbertModel

PRETRAINED_MODEL_MAP = {
    'bert': BertModel,
    'roberta': RobertaModel,
    'albert': AlbertModel
}

class FCLayer(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate=0., use_activation=True):
        super(FCLayer, self).__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()
    def forward(self, x):
        x = self.dropout(x)
        if self.use_activation:
            x = self.tanh(x)
        return self.linear(x)
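# Usage sketch for FCLayer: project pooled BERT features down to a small label space,
# here with the activation disabled as in the classifier head above (values illustrative).
fc = FCLayer(input_dim=768, output_dim=5, dropout_rate=0.1, use_activation=False)
dummy_features = torch.randn(8, 768)     # a fake batch of pooled outputs
logits = fc(dummy_features)              # shape [8, 5]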
def get_kobert_model():
    """ Return BertModel for Kobert """
    model = BertModel.from_pretrained('monologg/kobert')
    return model
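# Usage sketch: run the KoBERT encoder on a dummy batch. KoBERT uses a SentencePiece
# vocabulary, so its matching tokenizer is distributed separately from transformers'
# plain BertTokenizer (the ids below are illustrative placeholders, not real KoBERT ids).
import torch

kobert = get_kobert_model()
dummy_ids = torch.tensor([[2, 517, 5561, 3]])
dummy_mask = torch.ones_like(dummy_ids)
outputs = kobert(dummy_ids, attention_mask=dummy_mask)
last_hidden = outputs[0]   # [1, seq_len, hidden_size]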