How to use the transformers.DistilBertTokenizer class in transformers

To help you get started, we’ve selected a few transformers examples that show popular ways transformers.DistilBertTokenizer is used in public projects.

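Before looking at how the projects below wire it into larger scripts, here is a minimal usage sketch of the tokenizer on its own. It assumes the public distilbert-base-uncased checkpoint; the input sentence is just an illustration.

import transformers

# Load the pretrained WordPiece vocabulary; any compatible checkpoint name or local path works.
tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokens = tokenizer.tokenize("Hello, how are you?")       # WordPiece tokens
input_ids = tokenizer.encode("Hello, how are you?")      # token ids with [CLS]/[SEP] added
text = tokenizer.decode(input_ids)                       # ids back to a readable string

print(tokens)
print(input_ids)
print(text)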

github huggingface / transformers / examples / run_lm_finetuning.py
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)


MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
}


class TextDataset(Dataset):
    def __init__(self, tokenizer, args, file_path="train", block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
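
The MODEL_CLASSES mapping above is the pattern most of these example scripts share: look up a (config, model, tokenizer) triple by model type, then instantiate each with from_pretrained. A minimal sketch of that lookup, assuming the "distilbert" entry and the distilbert-base-uncased checkpoint:

from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer

MODEL_CLASSES = {
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
}

# Resolve the classes for the requested model type, then load pretrained weights and vocab.
config_class, model_class, tokenizer_class = MODEL_CLASSES["distilbert"]
config = config_class.from_pretrained("distilbert-base-uncased")
tokenizer = tokenizer_class.from_pretrained("distilbert-base-uncased")
model = model_class.from_pretrained("distilbert-base-uncased", config=config)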

github huggingface / transformers / examples / run_ner.py
logger = logging.getLogger(__name__)

ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
    ),
    (),
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
}


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
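
run_ner.py feeds the tokenizer whole batches of sentences with padding and truncation before training. A small sketch of that encoding step, assuming transformers >= 3.0 where the tokenizer is callable (older releases use encode_plus / batch_encode_plus); the sentences are illustrative:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

batch = tokenizer(
    ["John lives in Berlin .", "Acme Corp. hired Jane ."],
    padding=True,          # pad to the longest sequence in the batch
    truncation=True,       # cut sequences that exceed the model maximum
    return_tensors="pt",   # PyTorch tensors, as the example scripts expect
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)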

github explosion / spacy-transformers / spacy_transformers / _tokenizers.py
        by offsetting and accounting for special tokens."""
        offset = 0
        output = []
        for segment in segments:
            if segment:
                offset += 1
            seen = set()
            for idx_group in segment:
                output.append([idx + offset for idx in idx_group])
                seen.update({idx for idx in idx_group})
            offset += len(seen)
        return output


class SerializableDistilBertTokenizer(
    transformers.DistilBertTokenizer, SerializationMixin
):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
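
The serialization_fields above mirror attributes the stock DistilBertTokenizer already carries (vocab, ids_to_tokens, the lower-casing flags, and so on). A short sketch of inspecting those pieces on a normally constructed tokenizer; the checkpoint name is illustrative:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

print(tokenizer.vocab_size)                                  # size of the WordPiece vocabulary
print(tokenizer.convert_tokens_to_ids(["hello", "[MASK]"]))  # tokens -> ids
print(tokenizer.convert_ids_to_tokens([101, 102]))           # ids -> tokens ([CLS], [SEP])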

github huggingface / transformers / examples / distillation / run_squad_w_distillation.py
# The following import is the official SQuAD evaluation script (2.0).
# You can remove it from the dependencies if you are using this script outside of the library
# We've added it here for automated tests (see examples/test_examples.py file)
from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

logger = logging.getLogger(__name__)

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())

MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
}

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()

def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
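
For SQuAD-style training the tokenizer has to encode a (question, context) pair into a single sequence. A minimal sketch, again assuming a transformers version where the tokenizer is callable; the question and context are illustrative:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

enc = tokenizer(
    "Who released DistilBERT?",
    "DistilBERT is a small, fast transformer model released by Hugging Face.",
    max_length=384,
    truncation="only_second",   # truncate the context, never the question
    return_tensors="pt",
)
print(tokenizer.decode(enc["input_ids"][0]))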

github mindsdb / lightwood / lightwood / encoders / text / distilbert.py
        # Possible: speed, balance, accuracy
        self.aim = aim

        if self.aim == ENCODER_AIM.SPEED:
            # uses more memory, takes very long to train and outputs weird debugging statements to the command line,
            # consider waiting until it gets better or try to investigate why this happens
            # (changing the pretrained model doesn't seem to help)
            self._classifier_model_class = AlbertForSequenceClassification
            self._embeddings_model_class = AlbertModel
            self._tokenizer_class = AlbertTokenizer
            self._pretrained_model_name = 'albert-base-v2'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.BALANCE:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.ACCURACY:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768

        self.device, _ = get_devices()
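
The encoder above pairs a tokenizer class with a _model_max_len; at encoding time, long inputs have to be cut down to what the model accepts (DistilBERT's position embeddings allow 512 tokens). A sketch of that truncation with an illustrative over-long input, assuming a recent transformers release:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

long_text = "word " * 2000
input_ids = tokenizer.encode(long_text, max_length=512, truncation=True)
print(len(input_ids))   # capped at 512, including [CLS] and [SEP]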

github makcedward / nlpaug / nlpaug / model / lang_models / distilbert.py
    def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
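
Together with DistilBertForMaskedLM, the tokenizer supplies the mask token and maps predicted ids back to words. A sketch of a single-mask prediction; indexing the output with [0] works for both the older tuple returns and the newer output objects, and the tokenizer call assumes transformers >= 3.0:

import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.eval()

text = f"The capital of France is {tokenizer.mask_token}."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs)[0]   # (batch, seq_len, vocab_size)

mask_positions = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(predicted_ids.tolist()))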

github bhoov / exbert / server / aligner / aligner.py
            # Functional version that works with dictionaries
            return [meta_token.assoc("token", b) for b in bpe_tokens[offset:]]

        def _doc_to_fixed_tokens(self, doc: SpacyDoc) -> List[str]:
            """Extract tokens from a document, accounting for exceptions only if needed"""
            tokens = doc_to_fixed_tokens(doc)
            return tokens
        
    return Aligner
        
english = "en_core_web_sm"

BertAligner = MakeAligner(BertTokenizer, english)
GPT2Aligner = MakeAligner(GPT2Tokenizer, english)
RobertaAligner = MakeAligner(RobertaTokenizer, english)
DistilBertAligner = MakeAligner(DistilBertTokenizer, english)
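
The aligner exists because WordPiece splits words into subtokens that no longer line up one-to-one with spaCy's tokens. A tiny sketch of that behaviour; the exact split depends on the vocabulary:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer.tokenize("Tokenization matters"))
# e.g. ['token', '##ization', 'matters'] -- subword pieces, not whole spaCy tokens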

github kaushaltrivedi / fast-bert / fast_bert / data_lm.py
    RobertaForSequenceClassification,
    RobertaTokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
}

# Create text corpus suitable for language model training


def create_corpus(text_list, target_path, logger=None):

    nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "textcat"])

    with open(target_path, "w") as f:
        #  Split sentences for each document
        logger.info("Formatting corpus for {}".format(target_path))
        for text in progress_bar(text_list):

            text = fix_html(text)
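
Once the corpus file is written, a language-model data module typically reads it back line by line and turns each line into ids with the tokenizer picked from MODEL_CLASSES. A minimal sketch of that step; the file name is illustrative:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

with open("lm_corpus.txt", encoding="utf-8") as f:
    encoded_lines = [
        tokenizer.encode(line.strip(), max_length=512, truncation=True)
        for line in f
        if line.strip()
    ]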

github koursaros-ai / nboost / nboost / models / pt_models / distilbert_qa.py
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = DistilBertForQuestionAnswering.from_pretrained(self.model_dir)
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_dir)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
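
A sketch of using that tokenizer/model pair for extractive question answering, assuming a recent transformers release; the SQuAD-tuned checkpoint name is illustrative, and the [:2] slice covers both tuple-style and newer object-style outputs:

import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

model_dir = "distilbert-base-uncased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)
model = DistilBertForQuestionAnswering.from_pretrained(model_dir)
model.eval()

question = "What is DistilBERT distilled from?"
context = "DistilBERT is a smaller language model distilled from BERT."
inputs = tokenizer(question, context, return_tensors="pt")

with torch.no_grad():
    start_logits, end_logits = model(**inputs)[:2]

start = start_logits.argmax(dim=-1).item()
end = end_logits.argmax(dim=-1).item()
print(tokenizer.decode(inputs["input_ids"][0][start : end + 1]))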

github ThilinaRajapakse / simpletransformers / simpletransformers / classification / multi_label_classification_model.py
"""
        Initializes a MultiLabelClassification model.

        Args:
            model_type: The type of model (bert, roberta)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            num_labels (optional): The number of labels or classes in the dataset.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        """
        MODEL_CLASSES = {
            'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer),
            'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer),
            'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer),
            'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer),
            'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer),
        }

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
        self.model = model_class.from_pretrained(model_name, num_labels=num_labels)
        self.num_labels = num_labels

        if use_cuda:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            else:
                raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.")
        else:
            self.device = "cpu"

        self.results = {}
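
Wrappers like this one usually persist the tokenizer next to the fine-tuned model so both can be reloaded from a single directory later. A short sketch; the output path is illustrative:

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./outputs/distilbert")   # writes vocab.txt plus tokenizer config files
reloaded = DistilBertTokenizer.from_pretrained("./outputs/distilbert")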