How to use the torchtext.data.BucketIterator class in torchtext

To help you get started, we’ve selected a few torchtext.data.BucketIterator examples based on popular ways it is used in public projects.

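Before the project snippets below, here is a minimal end-to-end sketch of the typical BucketIterator workflow, assuming the classic torchtext (0.x, pre-legacy) data API. The file name, column layout, and field names are placeholders for illustration and are not taken from any of the projects shown here.

import torch
from torchtext import data

# Placeholder fields: tokenized text plus a categorical label
TEXT = data.Field(tokenize=lambda s: s.split(), lower=True)
LABEL = data.LabelField()

# 'reviews.csv' and its two columns are made-up placeholders
dataset = data.TabularDataset(
    path='reviews.csv', format='csv', skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)])
train_data, val_data = dataset.split(split_ratio=0.8)

TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

# BucketIterator groups examples of similar length to minimise padding
train_iter, val_iter = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size=32,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

for batch in train_iter:
    text, label = batch.text, batch.label  # text: [seq_len, batch_size]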

github AnubhavGupta3377 / Text-Classification-Models-Pytorch / Model_TextRNN / utils.py
        test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
        test_data = data.Dataset(test_examples, datafields)
        
        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)
        
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab
        
        self.train_iterator = data.BucketIterator(
            train_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)
        
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
        
        print ("Loaded {} training examples".format(len(train_data)))
        print ("Loaded {} test examples".format(len(test_data)))
        print ("Loaded {} validation examples".format(len(val_data)))
github microsoft / samples-for-ai / examples / NLPTutorials / paraphrase_identification / paraphrase_identification.py
def get_msrp_iter(args):
    if not os.path.exists(args.data_dir):
        os.mkdir(args.data_dir)

    TEXT = data.Field(lower=True, tokenize=tokenize_line_en)
    LABELS = data.Field(batch_first=True)
    train, val, test = MSRPDataset.splits(fields=(('sentence1', TEXT), ('sentence2', TEXT), ('labels', LABELS)), root=args.data_dir)
    TEXT.build_vocab(chain(train.sentence1, train.sentence2))
    LABELS.build_vocab(train.labels)
    print('Number of training examples:', len(train))
    print('Number of validation examples:', len(val))
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=args.batch_size,
        device='cuda' if torch.cuda.is_available() else None)
    return train_iter, val_iter, test_iter, TEXT.vocab, LABELS.vocab
github schelotto / Neural_Speed_Reading_via_Skim-RNN_PyTorch / main.py
def load_data(text_field, label_field, **kwargs):
    train_data, test_data, _ = SST.splits(text_field, label_field,
                                          filter_pred=lambda ex: ex.label != 'neutral')
    text_field.build_vocab(train_data, vectors=GloVe())
    label_field.build_vocab(train_data, test_data)
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_sizes=(args.batch_size, args.batch_size),
        shuffle=args.shuffle,
        **kwargs
    )
    return train_iter, test_iter
github MillionIntegrals / vel / vel / data / source / nlp / wmt14.py
    # English source field (its definition is cut off in this excerpt; assumed to mirror de_field below)
    en_field = data.Field(
        lower=True, tokenize=tokenize_en, batch_first=True, init_token='', eos_token=''
    )

    de_field = data.Field(
        lower=True, tokenize=tokenize_de, batch_first=True, init_token='', eos_token=''
    )

    train_source, val_source, test_source = WMT14Cached.splits(
        root=path,
        exts=('.en', '.de'),
        fields=(en_field, de_field)
    )

    en_field.build_vocab(train_source.src, min_freq=2)
    de_field.build_vocab(train_source.tgt, max_size=17_000)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_source, val_source, test_source),
        batch_size=batch_size,
        repeat=False
    )

    return SupervisedTextData(
        train_source, val_source, train_iter, val_iter, en_field, de_field
    )
github galsang / BiDAF-pytorch / model / data.py
            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # drop training examples whose context is too long, for efficiency
        if args.context_threshold > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim))

        print("building iterators...")
        device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
        self.train_iter, self.dev_iter = \
            data.BucketIterator.splits((self.train, self.dev),
                                       batch_sizes=[args.train_batch_size, args.dev_batch_size],
                                       device=device,
                                       sort_key=lambda x: len(x.c_word))
github IBM / pytorch-seq2seq / seq2seq / trainer / supervised_trainer.py
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step,
                       dev_data=None, teacher_forcing_ratio=0):
        log = self.logger

        print_loss_total = 0  # Reset every print_every
        epoch_loss_total = 0  # Reset every epoch

        device = None if torch.cuda.is_available() else -1  # legacy torchtext convention: None = current GPU, -1 = CPU
        batch_iterator = torchtext.data.BucketIterator(
            dataset=data, batch_size=self.batch_size,
            sort_key=lambda x: -len(x.src),
            device=device, repeat=False)

        steps_per_epoch = len(batch_iterator)
        total_steps = steps_per_epoch * n_epochs

        step = start_step
        step_elapsed = 0
        for epoch in range(start_epoch, n_epochs + 1):
            log.debug("Epoch: %d, Step: %d" % (epoch, step))

            batch_generator = batch_iterator.__iter__()
            # skip batches already consumed in a previous run (resuming mid-epoch)
            for _ in range((epoch - 1) * steps_per_epoch, step):
                next(batch_generator)
github jadore801120 / attention-is-all-you-need-pytorch / train.py
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]

    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
            'To share the word embedding, the src/trg word2idx tables must be the same.'

    fields = {'src': data['vocab']['src'], 'trg':data['vocab']['trg']}

    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
github hhsecond / HandsOnDeepLearningWithPytorch / 5.SequentialDataProcessing / AdvancedRNN / train.py
batch_size = 64
inputs = data.Field(lower=True)
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, dev, test)
vector = os.path.join(USERHOME, '.vector_cache', 'glove.6B.300d.txt.pt')
if os.path.isfile(vector):
    # TODO - make it customizable
    inputs.vocab.vectors = torch.load(vector)
else:
    inputs.vocab.load_vectors('glove.6B.300d')
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=batch_size)

vocab_dim = len(inputs.vocab)
out_dim = len(answers.vocab)
embed_dim = 300
cells = 2
birnn = True
lr = 0.01
epochs = 10
if birnn:
    cells *= 2
dropout = 0.5
fc1_dim = 50
fc2_dim = 3
n_layers = 2
network_type = 'LSTM'
github Stark-JC / code-for-nlp-beginner / Task3-Natural Language Inference / util.py
    train_data, dev_data, test_data = data.TabularDataset.splits(
        path=data_path,
        train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl',
        test='snli_1.0_test.jsonl',
        format='json',
        fields=fields,
        filter_pred=lambda ex: ex.label != '-'  # drop examples whose label is '-' (i.e. unlabeled)
    )
    if vectors is not None:
        TEXT.build_vocab(train_data, vectors=vectors, unk_init=torch.Tensor.normal_)
    else:
        TEXT.build_vocab(train_data)
    LABEL.build_vocab(dev_data)

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.premise) + len(x.hypothesis),
        sort_within_batch=True,
        repeat=False,
        shuffle=True
    )

    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)
github NLPatVCU / medinify / medinify / datasets / cnn_dataset.py
def get_data_loaders(self, w2v_file, train_file=None, validation_file=None,
                         train_comments=None, train_rating=None,
                         validation_comment=None, validation_ratings=None):
        train_dataset = self.get_dataset(comments=train_comments,
                                         ratings=train_rating,
                                         review_file=train_file)
        validation_dataset = self.get_dataset(comments=validation_comment,
                                              ratings=validation_ratings,
                                              review_file=validation_file)
        datasets = [train_dataset, validation_dataset]

        self._build_vocabs(datasets, w2v_file)

        train_loader = BucketIterator(train_dataset, config.BATCH_SIZE)
        validation_loader = BucketIterator(validation_dataset, config.BATCH_SIZE)
        return train_loader, validation_loader