How to use the torchtext.data.Iterator.splits function in torchtext

To help you get started, we've selected a few torchtext examples based on popular ways Iterator.splits is used in public projects.
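Before diving into the project examples, here is a minimal, self-contained sketch of the typical workflow: define Fields, load a dataset, build vocabularies, then let Iterator.splits create one iterator per split. The file name data.tsv, its column layout, and the batch size are assumptions for illustration only. Note that these snippets all target the classic torchtext API; starting with torchtext 0.9 this API moved to torchtext.legacy.data before being removed in later releases.

from torchtext import data

TEXT = data.Field(lower=True)            # tokenized text column
LABEL = data.Field(sequential=False)     # single-token label column

# data.tsv is a hypothetical two-column, tab-separated file
dataset = data.TabularDataset(path='data.tsv', format='tsv',
                              fields=[('text', TEXT), ('label', LABEL)])
train, dev = dataset.split(split_ratio=0.9)

# vocabularies must be built before batches can be numericalized
TEXT.build_vocab(train)
LABEL.build_vocab(train)

# one iterator per dataset; batch_sizes gives each split its own batch size
# (here the whole dev set is served as a single batch)
train_iter, dev_iter = data.Iterator.splits(
    (train, dev),
    batch_sizes=(32, len(dev)),
    sort_key=lambda ex: len(ex.text),
    device=-1)  # -1 meant CPU in older releases; newer ones take a torch.device

for batch in train_iter:
    x, y = batch.text, batch.label  # tensors shaped (seq_len, batch) by default
    break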


github mandarjoshi90 / pair2vec / noallen / data.py
from torchtext.data import Field, Iterator, TabularDataset

def read_data(config):
    args = Field(lower=True, tokenize='spacy') if config.compositional_args else Field()
    rels = Field(lower=True, tokenize='spacy') if config.relational_args else Field()

    # TODO we will need to add a header to the files
    data = TabularDataset(path=config.data_path, format='tsv',
                          fields=[('subject', args), ('relation', rels), ('object', args)])
    train, dev = data.split(split_ratio=0.99)
    print('Train size:', len(train), '   Dev size:', len(dev))
    
    args.build_vocab(train)
    rels.build_vocab(train)
    config.n_args = len(args.vocab)
    config.n_rels = len(rels.vocab)
    print("#Args:", config.n_args, "   #Rels:", config.n_rels)
    
    # Note: the local name `args` now refers to a Field, so the device must come
    # from config (assumed here to carry a `gpu` attribute); `args.gpu` would fail.
    train_iter, dev_iter = Iterator.splits((train, dev), batch_size=config.batch_size, device=config.gpu)
    train_iter.repeat = False
    
    # TODO need to figure out how to duplicate the relations field and detach it
    # from the regular order, which would let us sample relations effectively.
    
    return train_iter, dev_iter
github hoxmark / Deep_reinforcement_active_learning / original / main.py
def mr(text_field, label_field, **kargs):
    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
                                (train_data, dev_data),
                                batch_sizes=(args.batch_size, len(dev_data)),
                                **kargs)
    return train_iter, dev_iter
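Note the contrast with the first example: batch_size applies one size to every split, while batch_sizes assigns a separate size per split. Here the entire dev set is served as a single batch, a common pattern for evaluation.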
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main_hyperparams.py
def mrs_two_mui(path, train_name, dev_name, test_name, char_data, text_field, label_field, static_text_field, static_label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_two.MR.splits(path, train_name, dev_name, test_name,
                                                                    char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_two.MR.splits(path, train_name, dev_name,
                                                                                         test_name,
                                                                                         char_data, static_text_field,
                                                                                         static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    print("len(train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data, min_freq=args.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github bigboNed3 / chinese_text_cnn / main.py
def load_dataset(text_field, label_field, args, **kwargs):
    train_dataset, dev_dataset = dataset.get_dataset('data', text_field, label_field)
    if args.static and args.pretrained_name and args.pretrained_path:
        vectors = load_word_vectors(args.pretrained_name, args.pretrained_path)
        text_field.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        text_field.build_vocab(train_dataset, dev_dataset)
    label_field.build_vocab(train_dataset, dev_dataset)
    train_iter, dev_iter = data.Iterator.splits(
        (train_dataset, dev_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset)),
        sort_key=lambda x: len(x.text),
        **kwargs)
    return train_iter, dev_iter
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main.py
def mrs_five(text_field, label_field,  **kargs):
    train_data, dev_data, test_data = mydatasets_self_five.MR.splits(text_field, label_field)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data)
    label_field.build_vocab(train_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main_hyperparams.py
def mrs_five_mui(path, train_name, dev_name, test_name, char_data, text_field, label_field, static_text_field,
                 static_label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name, test_name,
                                                                     char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name,
                                                                                          test_name,
                                                                                          char_data,
                                                                                          static_text_field,
                                                                                          static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    print("len(static_train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data, min_freq=args.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github SeojinBang / VIBI / return_data.py
        # (the snippet opens mid-call; the text Field is reconstructed here,
        #  assuming only lower and batch_first were set)
        text = data.Field(lower = True,
                          batch_first = True)
        label = data.Field(lower = True)
        label_pred = data.Field(use_vocab = False, fix_length = 1)
        fname = data.Field(use_vocab = False, fix_length = 1)
        
        train, valid, test = IMDB_modified.splits(text, label, label_pred, fname,
                                                  root = root, model_name = args.model_name,
                                                  load_pred = args.load_pred)
        print("build vocab...")
        text.build_vocab(train, vectors = GloVe(name = '6B',
                                                dim = embedding_dim,
                                                cache = root), max_size = max_total_num_words)
        label.build_vocab(train)
        
        print("Create Iterator objects for multiple splits of a dataset...")
        train_loader, valid_loader, test_loader = data.Iterator.splits((train, valid, test),
                                                                       batch_size = batch_size,
                                                                       device = device,
                                                                       repeat = False)
        
        data_loader['word_idx'] = text.vocab.itos
        data_loader['x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
        data_loader['y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
        data_loader['max_total_num_words'] = max_total_num_words
        data_loader['embedding_dim'] = embedding_dim
        data_loader['max_num_words'] = 50
        data_loader['max_num_sents'] = int(next(iter(train_loader)).text.size(-1) / data_loader['max_num_words'])

    else: raise UnknownDatasetError()
    
    data_loader['train'] = train_loader
    data_loader['valid'] = valid_loader
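Passing repeat = False makes each iterator stop after a single pass over its dataset instead of cycling forever; in older torchtext releases the training iterator repeated by default, so omitting this flag turned a plain "for batch in train_loader" loop into an infinite one.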
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / version-18 / main_hyperparams.py
def mrs_two_mui(path, train_name, dev_name, test_name, char_data, text_field, label_field, static_text_field, static_label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_two.MR.splits(path, train_name, dev_name, test_name,
                                                                    char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_two.MR.splits(path, train_name, dev_name,
                                                                                         test_name,
                                                                                         char_data, static_text_field,
                                                                                         static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    print("len(train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data, min_freq=args.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main.py
def mrs_two(text_field, label_field,  **kargs):
    train_data, dev_data, test_data = mydatasets_self_two.MR.splits(text_field, label_field)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data)
    label_field.build_vocab(train_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
                                        (train_data, dev_data, test_data),
                                        batch_sizes=(args.batch_size,
                                                     len(dev_data),
                                                     len(test_data)),
                                        **kargs)
    return train_iter, dev_iter, test_iter