# cotk-style loader helpers; the dataloader classes are assumed to come from cotk:
from cotk.dataloader import PretrainedTokenizer, UbuntuCorpus, SST, SwitchboardCorpus

def load_ubuntucorpus():  # enclosing factory reconstructed from the dangling `return` below
    def _load_ubuntucorpus(min_rare_vocab_times=0):
        from transformers import GPT2Tokenizer
        toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
        return UbuntuCorpus("./tests/dataloader/dummy_ubuntucorpus#Ubuntu", min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="gpt2")
    return _load_ubuntucorpus

def load_sst():  # enclosing factory reconstructed from the dangling `return` below
    def _load_sst(min_rare_vocab_times=0):
        from transformers import GPT2Tokenizer
        toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
        return SST("./tests/dataloader/dummy_sst#SST", tokenizer=toker, min_rare_vocab_times=min_rare_vocab_times, pretrained="gpt2")
    return _load_sst

def _load_switchboardcorpus(min_rare_vocab_times=0):
    from transformers import GPT2Tokenizer
    toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
    return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
                             min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="gpt2")
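# Sketch of intended use (not part of the original snippet): each factory returns a
# loader closure that a test can call with different rare-vocab thresholds.
loader = load_sst()
dataloader = loader(min_rare_vocab_times=2)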
def test_gpt2_embeddings():
    import torch
    import flair
    from transformers import GPT2Model, GPT2Tokenizer

    gpt_model: str = "gpt2-medium"
    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<|endoftext|>" + s + "<|endoftext|>")
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
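        # Illustrative check only (not in the original snippet): with
        # output_hidden_states=True the last element of the output tuple holds one
        # tensor per layer (embedding layer plus each transformer block).
        assert len(hidden_states) == model.config.n_layer + 1
        assert hidden_states[-1].shape == (1, len(tokens), model.config.n_embd)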
import logging
from dataclasses import dataclass

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          get_constant_schedule, get_cosine_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

from sp_encoder import SPEncoder
from yt_encoder import YTEncoder

logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}
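# Sketch (not from the original file) of how the MODEL_CLASSES registry is typically
# consumed: key on a --model_type style string and load pretrained weights.
# `model_type` and `model_name_or_path` are placeholder names.
def load_pretrained(model_type='gpt2', model_name_or_path='gpt2'):
    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    config = config_class.from_pretrained(model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(model_name_or_path)
    model = model_class.from_pretrained(model_name_or_path, config=config)
    return config, model, tokenizer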
@dataclass
class MovingLoss():
    steps: int = 1000
    avg_loss = (0.0, 0.0)

    def add(self, batch_loss: float):
        # exponential moving average kept as a (weighted sum, total weight) pair
        k_s = 1 - 1 / self.steps
        self.avg_loss = (self.avg_loss[0] * k_s + batch_loss * (1 - k_s),
                         self.avg_loss[1] * k_s + 1.0 * (1 - k_s))

    @property
    def loss(self):
        # property body reconstructed: debias the running sum by the accumulated weight
        return self.avg_loss[0] / self.avg_loss[1] if self.avg_loss[1] else None
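# Sketch of intended use (not from the original file); the loss values are made up.
losses = MovingLoss(steps=1000)
for batch_loss in [2.9, 2.7, 2.6]:
    losses.add(batch_loss)
print(losses.loss)  # smoothed (debiased) average of recent batch losses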
    # Tail of a data-loading factory; the DataLoader construction around the surviving
    # keyword arguments is a minimal reconstruction.
    def load_examples():
        example_loader = DataLoader(dataset, batch_size=batch_size,
                                    pin_memory=True,
                                    collate_fn=collate_fn)
        yield from example_loader
    return load_examples
from os.path import join
from transformers import GPT2Tokenizer, XLNetTokenizer

TOKENIZER = {
    'xlnet-base-cased': XLNetTokenizer,
    'xlnet-large-cased': XLNetTokenizer,
    'distilgpt2': GPT2Tokenizer,
    'gpt2': GPT2Tokenizer,
    'gpt2-medium': GPT2Tokenizer,
    'gpt2-large': GPT2Tokenizer,
    'gpt2-xl': GPT2Tokenizer
}
def create_tokenizer(args):
    """
    Creates the tokenizer for the model and saves it in the
    model data directory if it does not exist.
    """
    data_dir = join(
        args.data_dir, args.data, args.model)
    tokenizer_path = join(
        data_dir, 'special_tokens_map.json')
    assert args.model in TOKENIZER, \
        'Available tokenizers: {} received `{}`'.format(
            ', '.join(TOKENIZER), args.model)
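    # The function body is cut off after the assert; given the docstring, a plausible
    # (hypothetical, not original) continuation would be:
    import os
    if not os.path.exists(tokenizer_path):
        tokenizer = TOKENIZER[args.model].from_pretrained(args.model)
        tokenizer.save_pretrained(data_dir)
    else:
        tokenizer = TOKENIZER[args.model].from_pretrained(data_dir)
    return tokenizer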
# The head of this import is missing from the snippet; reconstructed from the classes used below.
from transformers import (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig,
                          GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          XLNetLMHeadModel, XLNetTokenizer,
                          TransfoXLLMHeadModel, TransfoXLTokenizer)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
}
# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
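# Sketch of how the padding trick described above is applied (helper and argument
# names here are placeholders, not the original script's): the padding text is
# prepended to short prompts, and its tokens are stripped again when decoding.
def prepare_xlnet_prompt(tokenizer, prompt_text):
    input_ids = tokenizer.encode(PADDING_TEXT + prompt_text,
                                 add_special_tokens=False, return_tensors='pt')
    prompt_offset = len(tokenizer.encode(PADDING_TEXT, add_special_tokens=False))
    return input_ids, prompt_offset  # the first prompt_offset tokens are discarded later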
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from filelock import FileLock
import contextlib
import logging

logger = logging.getLogger(__name__)

def log_info(*args, **kwargs):
    # only the master TPU process (ordinal 0) writes to the log
    if xm.is_master_ordinal():
        logger.info(*args, **kwargs)
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}
@dataclass
class MovingLoss():
    steps: int = 1000
    avg_loss = (0.0, 0.0)

    def add(self, batch_loss: float):
        # exponential moving average kept as a (weighted sum, total weight) pair
        k_s = 1 - 1 / self.steps
        self.avg_loss = (self.avg_loss[0] * k_s + batch_loss * (1 - k_s),
                         self.avg_loss[1] * k_s + 1.0 * (1 - k_s))

    @property
    def loss(self):
        # property body reconstructed: debias the running sum by the accumulated weight
        return self.avg_loss[0] / self.avg_loss[1] if self.avg_loss[1] else None
# Head of this import is missing from the snippet; reconstructed from the classes used below.
from transformers import (
    CTRLLMHeadModel, CTRLTokenizer, GPT2LMHeadModel, GPT2Tokenizer,
    OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, TransfoXLLMHeadModel, TransfoXLTokenizer,
    XLMTokenizer,
    XLMWithLMHeadModel,
    XLNetLMHeadModel,
    XLNetTokenizer,
)
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
)
logger = logging.getLogger(__name__)
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
MODEL_CLASSES = {
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
}
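# Sketch (not from the original script) of the usual selection step: pick the
# (model, tokenizer) pair by model type, then sample a continuation. Assumes a
# transformers version where GPT2LMHeadModel.generate supports sampling arguments.
model_class, tokenizer_class = MODEL_CLASSES["gpt2"]
tokenizer = tokenizer_class.from_pretrained("gpt2")
model = model_class.from_pretrained("gpt2")
input_ids = tokenizer.encode("The Manhattan Bridge is", return_tensors="pt")
output_ids = model.generate(input_ids, max_length=40, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))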
# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
            # (tail of a tokenize method on the Aligner class generated by MakeAligner;
            #  the head of this snippet is missing)
            bpe_tokens = super().tokenize(s)  # Can't do `self.tokenize` because it will normalize again
            # Functional version that works with dictionaries
            return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]

        def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
            """Extract tokens from a document, accounting for exceptions only if needed"""
            tokens = doc_to_fixed_tokens(doc)
            return tokens

    return Aligner


english = "en_core_web_sm"
BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)