How to use the torchtext.data.BucketIterator.splits function in torchtext

To help you get started, we’ve selected a few torchtext.data.BucketIterator.splits examples, based on popular ways it is used in public projects.
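
Before diving into the project examples, here is a minimal, self-contained sketch of the call itself. It assumes the legacy torchtext API (torchtext <= 0.8, or torchtext.legacy.data on 0.9 to 0.11; BucketIterator was removed in later releases), and the toy dataset and field names are purely illustrative:

import torch
from torchtext import data  # on torchtext 0.9-0.11 use: from torchtext.legacy import data

# Fields describe how raw strings are turned into tensors.
TEXT = data.Field(tokenize=str.split, lower=True, batch_first=True)
LABEL = data.LabelField()

# A tiny in-memory dataset; in practice you would load a TabularDataset or a built-in dataset.
fields = [('text', TEXT), ('label', LABEL)]

def make_example(sentence, label):
    return data.Example.fromlist([sentence, label], fields)

train = data.Dataset([make_example('a great movie', 'pos'),
                      make_example('truly awful', 'neg')], fields)
valid = data.Dataset([make_example('not bad at all', 'pos')], fields)
test = data.Dataset([make_example('boring and slow', 'neg')], fields)

TEXT.build_vocab(train)
LABEL.build_vocab(train)

# One iterator per dataset; examples of similar length are bucketed together
# so each batch needs as little padding as possible.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=2,
    sort_key=lambda ex: len(ex.text),
    repeat=False,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

for batch in train_iter:
    print(batch.text.shape, batch.label.shape)  # (batch, seq_len) and (batch,)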

github kolloldas / torchnlp / tests / tasks / test_sequence_tagging.py
def udpos_dataset(batch_size):
    # Setup fields with batch dimension first
    inputs = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)
    tags = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)
    
    # Download and load the default data.
    train, val, test = datasets.UDPOS.splits(
        fields=(('inputs_word', inputs), ('labels', tags), (None, None)))
    
    # Build vocab
    inputs.build_vocab(train.inputs_word)
    tags.build_vocab(train.labels)
    
    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
                            (train, val, test), batch_size=batch_size, 
                            device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False
    return train_iter, val_iter, test_iter, inputs, tags
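
Each batch produced by these iterators exposes one attribute per field name declared in the fields tuple, so here batch.inputs_word and batch.labels, both shaped (batch, seq_len) because the fields use batch_first=True. A hedged usage sketch (the batch size and loop body are illustrative, not part of the original test):

train_iter, val_iter, test_iter, inputs, tags = udpos_dataset(batch_size=32)

for batch in train_iter:
    words = batch.inputs_word   # LongTensor of word ids, shape (batch, seq_len)
    labels = batch.labels       # LongTensor of tag ids,  shape (batch, seq_len)
    # feed `words` to a tagger and compute a loss against `labels` here
    break
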
github THUDM / KOBE / SeqGen_Shuangqing / utils / reward_provider.py
    print('Loading dataset...')
    train_data, test_data = KCDataset.splits(TEXT, LABEL, root='../data')
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))

    # Build the vocab
    print('Building vocab...')
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)

    # And create the iterators.
    BATCH_SIZE = 64

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device)

    # As previously, we'll create an instance of our `FastText` class.

    INPUT_DIM = len(TEXT.vocab)

    # model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS,
                FILTER_SIZES, OUTPUT_DIM, DROPOUT)

    # Train the Model
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
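
A possible continuation of this snippet, sketched under the assumption that the dataset registers its fields under the names text and label (so batches expose batch.text and batch.label) and that the CNN expects batch-first input; none of this is shown in the original source:

model = model.to(device)
criterion = criterion.to(device)

model.train()
for batch in train_iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)           # logits, shape (batch,)
    loss = criterion(predictions, batch.label.float())   # BCEWithLogitsLoss expects float targets
    loss.backward()
    optimizer.step()
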
github pbloem / former / experiments / classify.py
    # load the IMDB data
    if arg.final:
        train, test = datasets.IMDB.splits(TEXT, LABEL)

        TEXT.build_vocab(train, max_size=arg.vocab_size - 2)
        LABEL.build_vocab(train)

        train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=arg.batch_size, device=util.d())
    else:
        tdata, _ = datasets.IMDB.splits(TEXT, LABEL)
        train, test = tdata.split(split_ratio=0.8)

        TEXT.build_vocab(train, max_size=arg.vocab_size - 2) # - 2 to make space for <unk> and <pad>
        LABEL.build_vocab(train)

        train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=arg.batch_size, device=util.d())

    print(f'- nr. of training batches {len(train_iter)}')
    print(f'- nr. of {"test" if arg.final else "validation"} batches {len(test_iter)}')

    if arg.max_length < 0:
        mx = max([input.text[0].size(1) for input in train_iter])
        mx = mx * 2
        print(f'- maximum sequence length: {mx}')
    else:
        mx = arg.max_length

    # create the model
    model = former.CTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth, seq_length=mx, num_tokens=arg.vocab_size, num_classes=NUM_CLS, max_pool=arg.max_pool)
    if torch.cuda.is_available():
        model.cuda()
github wabyking / TextClassificationBenchmark / utils.py
        train, test = datasets.IMDB.splits(TEXT, LABEL)
    elif opt.dataset=="sst":
        train, val, test = datasets.SST.splits( TEXT, LABEL, fine_grained=True, train_subtrees=True,
                                               filter_pred=lambda ex: ex.label != 'neutral')
    elif opt.dataset=="trec":
        train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
    else:
        print("does not support this datset")
        
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)    
    # print vocab information
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size, device=device, repeat=False, shuffle=True)

    opt.label_size = len(LABEL.vocab)
    opt.vocab_size = len(TEXT.vocab)
    opt.embedding_dim = TEXT.vocab.vectors.size()[1]
    opt.embeddings = TEXT.vocab.vectors
    
    return train_iter, test_iter
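
The pretrained GloVe vectors attached to opt.embeddings (i.e. TEXT.vocab.vectors) are typically copied into the model's embedding layer afterwards. A minimal sketch; the nn.Embedding layer here is hypothetical and not part of the snippet above:

import torch.nn as nn

embedding = nn.Embedding(opt.vocab_size, opt.embedding_dim)
embedding.weight.data.copy_(opt.embeddings)   # initialise from the pretrained vectors
# or, equivalently:
# embedding = nn.Embedding.from_pretrained(opt.embeddings, freeze=False)
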
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main.py
def sst(text_field, label_field,  **kargs):
    print("SST")
    train_data, dev_data, test_data = sstdatasets.SST.splits(text_field, label_field, fine_grained=True)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size, 
                                                     len(dev_data), 
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github keon / seq2seq / utils.py
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
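
Because both fields set include_lengths=True, each batch attribute returned by these iterators is a (padded_tensor, lengths) pair rather than a plain tensor. A hedged sketch of how that pair is usually consumed (the embedding layer is hypothetical):

from torch.nn.utils.rnn import pack_padded_sequence

for batch in train_iter:
    src, src_lengths = batch.src   # src: (seq_len, batch) token ids; src_lengths: (batch,)
    trg, trg_lengths = batch.trg
    embedded = embedding(src)      # hypothetical nn.Embedding over the DE vocabulary
    packed = pack_padded_sequence(embedded, src_lengths.cpu(), enforce_sorted=False)
    # run `packed` through the encoder, teacher-force the decoder with `trg`
    break
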
github hoxmark / Deep_reinforcement_active_learning / original / main.py
def sst(text_field, label_field,  **kargs):
    train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github shijx12 / AR-Tree / sst / evaluate.py
    if not args.fine_grained:
        filter_pred = lambda ex: ex.label != 'neutral'
    dataset_splits = datasets.SST.splits(
        root=args.datadir, text_field=text_field, label_field=label_field,
        fine_grained=args.fine_grained, train_subtrees=True,
        filter_pred=filter_pred)
    test_dataset = dataset_splits[2]

    text_field.build_vocab(*dataset_splits)
    label_field.build_vocab(*dataset_splits)
    text_field.vocab.id_to_word = lambda i: text_field.vocab.itos[i]
    text_field.vocab.id_to_tf = lambda i: text_field.freqs[i]

    print(f'Number of classes: {len(label_field.vocab)}')

    _, _, test_loader = data.BucketIterator.splits(
        datasets=dataset_splits, batch_size=args.batch_size, device=args.gpu)

    num_classes = len(label_field.vocab)
    model = SSTModel(
        typ='RL-SA',
        vocab=text_field.vocab,
        num_classes=num_classes, num_words=len(text_field.vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        clf_hidden_dim=args.clf_hidden_dim,
        clf_num_layers=args.clf_num_layers,
        use_leaf_rnn=args.leaf_rnn,
        use_batchnorm=args.batchnorm,
        dropout_prob=args.dropout,
        bidirectional=args.bidirectional,
        cell_type=args.cell_type,
        att_type=args.att_type,
github MultiPath / MetaNMT / meta_nmt5.py
    else:
        return max(len(new.src), len(new.trg),  prev_max_len) * i

def dyn_batch_without_padding(new, i, sofar):
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))


if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding # dyn_batch_without_padding

train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data), batch_sizes=(args.batch_size, args.valid_batch_size), device=args.gpu, shuffle=False,
    batch_size_fn=batch_size_fn, repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset, batch_size=args.batch_size, device=args.gpu, train=True, batch_size_fn=batch_size_fn, shuffle=False)
            for dataset in aux_data]
logger.info("build the dataset. done!")


# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
            'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000} # ~32
args.__dict__.update(hparams)

# ----------------------------------------------------------------------------------------------------------------- #
# show the arg:
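
The batch_size_fn hook used above changes what batch_size means: for every candidate example the iterator calls batch_size_fn(new_example, count_in_batch, size_so_far) and keeps adding examples until the returned value reaches batch_size, so args.batch_size acts as a token budget rather than a sentence count. A minimal sketch of the same idea, reusing train_data and args.gpu from the snippet above (the function name is illustrative):

def tokens_so_far(new_example, count, size_so_far):
    # Running total of source/target tokens, ignoring padding
    # (the same idea as dyn_batch_without_padding above).
    return size_so_far + max(len(new_example.src), len(new_example.trg))

token_budget_iter = data.BucketIterator(
    train_data,
    batch_size=4096,               # roughly 4096 tokens per batch, not 4096 sentences
    batch_size_fn=tokens_so_far,
    device=args.gpu)
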
github lrank / Robust_and_Privacy_preserving_Text_Representations / Pytorch / sentiment / baseline_model.py
# 2. data.TabularDataset
train_data, valid_data, test_data = data.TabularDataset.splits(path=dataset_path,
                                                               train="train.csv",
                                                               validation="valid.csv",
                                                               test="test.csv",
                                                               fields=[('text', TEXT), ('rating', RATING_LABEL), ('gender', GENDER_LABEL),
                                                                       ('age', AGE_LABEL), ('location', LOCALTION_LABEL)],
                                                               format="csv")

print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}".format(len(test_data)))
print("vars(train_data[0]) = {}\n".format(vars(train_data[0])))

# 3. data.BucketIterator
train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                               batch_size=BATCH_SIZE,
                                                               device=device,
                                                               sort_key=lambda x: len(x.text))

# 4. Build vocab
# TEXT.build_vocab(train_data)
# unk_init=torch.Tensor.normal_)
# LABELS.build_vocab(train_data)
# print("vars(train_data[0]) = ", vars(train_data[0]))

# 4.1 (Optional) If build vocab with pre-trained word embedding vectors
TEXT.build_vocab(train_data, vectors="glove.6B.100d")
RATING_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)
LOCALTION_LABEL.build_vocab(train_data)
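
A note on sort_key: TabularDataset does not define one, so it has to be passed to BucketIterator.splits explicitly for bucketing to group reviews of similar length. If the downstream model packs padded sequences, sort_within_batch=True is usually added as well; a hedged variant of the call above:

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                               batch_size=BATCH_SIZE,
                                                               device=device,
                                                               sort_key=lambda x: len(x.text),
                                                               sort_within_batch=True)  # sort each batch by length, e.g. for pack_padded_sequence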