How to use the GPT2Tokenizer class in transformers

To help you get started, we’ve selected a few GPT2Tokenizer examples, based on popular ways it is used in public projects.

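Before the project excerpts below, here is a minimal, illustrative sketch of the typical workflow (not taken from any of these projects, and assuming the standard "gpt2" checkpoint is available for download): load the pretrained byte-level BPE vocabulary with GPT2Tokenizer.from_pretrained, turn text into token ids, and decode the ids back into text.

from transformers import GPT2Tokenizer

# Download (or read from cache) the vocab.json and merges.txt of the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

text = "Berlin and Munich have a lot of puppeteer to see ."
tokens = tokenizer.tokenize(text)                    # byte-level BPE subword tokens
input_ids = tokenizer.convert_tokens_to_ids(tokens)  # integer ids the model consumes
print(tokens)
print(input_ids)
print(tokenizer.decode(input_ids))                   # round-trips back to the original text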

Example from thu-coai/cotk (tests/dataloader/test_multi_turn_dialog.py):
	# Inner loader returned by an enclosing factory function (omitted here); UbuntuCorpus and
	# PretrainedTokenizer come from cotk's dataloader package.
	def _load_ubuntucorpus(min_rare_vocab_times=0):
		from transformers import GPT2Tokenizer
		# Build GPT2Tokenizer directly from local dummy vocab/merges files instead of a downloaded checkpoint.
		toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
		return UbuntuCorpus("./tests/dataloader/dummy_ubuntucorpus#Ubuntu", min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="gpt2")
	return _load_ubuntucorpus

Example from thu-coai/cotk (tests/dataloader/test_sentence_classification.py):
	def _load_sst(min_rare_vocab_times=0):
		from transformers import GPT2Tokenizer
		toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
		return SST("./tests/dataloader/dummy_sst#SST", tokenizer=toker, min_rare_vocab_times=min_rare_vocab_times, pretrained="gpt2")
	return _load_sst

Example from thu-coai/cotk (tests/dataloader/test_multi_turn_dialog.py):
	def _load_switchboardcorpus(min_rare_vocab_times=0):
		from transformers import GPT2Tokenizer
		toker = PretrainedTokenizer(GPT2Tokenizer('./tests/dataloader/dummy_gpt2vocab/vocab.json', './tests/dataloader/dummy_gpt2vocab/merges.txt'))
		return SwitchboardCorpus("./tests/dataloader/dummy_switchboardcorpus#SwitchboardCorpus",
								 min_rare_vocab_times=min_rare_vocab_times, tokenizer=toker, pretrained="gpt2")

Example from flairNLP/flair (tests/test_transformer_embeddings.py):
def test_gpt2_embeddings():
    gpt_model: str = "gpt2-medium"

    # Load the pretrained GPT-2 tokenizer and model; output_hidden_states exposes every layer's activations.
    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        # Wrap the sentence in GPT-2's end-of-text marker and tokenize it into byte-level BPE tokens.
        tokens = tokenizer.tokenize("<|endoftext|>" + s + "<|endoftext|>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        # The last element of the model output is the tuple of per-layer hidden states.
        hidden_states = model(tokens_tensor)[-1]

Example from mgrankin/ru_transformers (run_lm_finetuning.py):
import logging
from dataclasses import dataclass

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, get_constant_schedule, get_cosine_schedule_with_warmup,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

from sp_encoder import SPEncoder
from yt_encoder import YTEncoder

logger = logging.getLogger(__name__)


# Maps a model type name to its (config class, LM head model class, tokenizer class).
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}

# Exponentially weighted moving average of the training loss; the running numerator and
# denominator are tracked separately in avg_loss.
@dataclass
class MovingLoss():
    steps:int=1000
    avg_loss = (0.0, 0.0)
    def add(self, batch_loss:float):
        k_s = 1 - 1/self.steps
        avg_loss = self.avg_loss
        self.avg_loss = (self.avg_loss[0] * k_s + batch_loss * (1-k_s),
                         self.avg_loss[1] * k_s + 1.0 * (1-k_s))
    @property

Example from bme-chatbots/dialogue-generation (src/data.py):
                pin_memory=True,
                collate_fn=collate_fn)

            yield from example_loader

    return load_examples


# Maps a model name to its Hugging Face tokenizer class; every GPT-2 variant uses GPT2Tokenizer.
TOKENIZER = {
    'xlnet-base-cased':     XLNetTokenizer,
    'xlnet-large-cased':    XLNetTokenizer,
    'distilgpt2':           GPT2Tokenizer,
    'gpt2':                 GPT2Tokenizer,
    'gpt2-medium':          GPT2Tokenizer,
    'gpt2-large':           GPT2Tokenizer,
    'gpt2-xl':              GPT2Tokenizer
}


def create_tokenizer(args):
    """
    Creates the tokenizer for the model and saves
    it in the model data directory if it does not exist.
    """
    data_dir = join(
        args.data_dir, args.data, args.model)

    tokenizer_path = join(
        data_dir, 'special_tokens_map.json')

    assert args.model in TOKENIZER, \
        'Available tokenizers: {} received `{}`'.format(

Example from mgrankin/ru_transformers (run_generation.py):
import logging

from transformers import (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig,
                                    GPT2LMHeadModel, GPT2Tokenizer,
                                    OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                    XLNetLMHeadModel, XLNetTokenizer,
                                    TransfoXLLMHeadModel, TransfoXLTokenizer)


logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())

MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
}

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of

Example from mgrankin/ru_transformers (debug_lm_finetuning.py):
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from filelock import FileLock
import contextlib

logger = logging.getLogger(__name__)

def log_info(*args, **kwargs):
    if xm.is_master_ordinal():
        logger.info(*args, **kwargs)

MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}

@dataclass
class MovingLoss():
    steps:int=1000
    avg_loss = (0.0, 0.0)
    def add(self, batch_loss:float):
        k_s = 1 - 1/self.steps
        avg_loss = self.avg_loss
        self.avg_loss = (self.avg_loss[0] * k_s + batch_loss * (1-k_s),
                         self.avg_loss[1] * k_s + 1.0 * (1-k_s))
    @property

Example from huggingface/transformers (examples/run_generation.py):
import logging

from transformers import (
    CTRLLMHeadModel, CTRLTokenizer,
    GPT2LMHeadModel, GPT2Tokenizer,
    OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
    TransfoXLLMHeadModel, TransfoXLTokenizer,
    XLMTokenizer, XLMWithLMHeadModel,
    XLNetLMHeadModel, XLNetTokenizer,
)


logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

MODEL_CLASSES = {
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
}

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his

Example from bhoov/exbert (server/aligner/aligner.py):
            bpe_tokens = super().tokenize(s)  # Can't do `self.tokenize` because it will normalize again

            # Functional version that works with dictionaries
            return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]

        def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
            """Extract tokens from a document, accounting for exceptions only if needed"""
            tokens = doc_to_fixed_tokens(doc)
            return tokens
        
    return Aligner
        
english = "en_core_web_sm"

BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)