How to use the torchtext.data.TabularDataset.splits function in torchtext

To help you get started, we’ve selected a few torchtext examples based on popular ways it is used in public projects.


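Every example below follows the same basic pattern: declare a Field for each column in the file, point TabularDataset.splits at the directory holding the train/validation/test files, then build vocabularies and wrap the splits in iterators. The sketch below shows that pattern in its simplest form; the directory data/, the file names train.tsv, valid.tsv and test.tsv, and the column names text and label are illustrative assumptions, not taken from any of the projects below.

from torchtext import data

# One Field per column; for tsv/csv input, fields are matched to columns by position.
TEXT = data.Field(lower=True)
LABEL = data.LabelField()

# Assumed layout: data/train.tsv, data/valid.tsv, data/test.tsv,
# each row holding "text<TAB>label".
train, valid, test = data.TabularDataset.splits(
    path='data', train='train.tsv', validation='valid.tsv', test='test.tsv',
    format='tsv', fields=[('text', TEXT), ('label', LABEL)])

# Vocabularies are normally built from the training split only.
TEXT.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)

# BucketIterator groups examples of similar length to minimise padding.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test), batch_size=32, sort_key=lambda x: len(x.text))

For JSON input (as in the SST2 and BiDAF examples below), fields is a dict keyed by the JSON field names rather than a positional list. Note that these classes belong to the classic torchtext API, which later releases moved to torchtext.legacy.data and eventually removed, so the examples assume an older torchtext version.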
github pytorch / text / test / data.py
from torchtext import data


TEXT = data.Field()
LABELS = data.Field()

train, val, test = data.TabularDataset.splits(
    path='~/chainer-research/jmt-data/pos_wsj/pos_wsj', train='.train',
    validation='.dev', test='.test', format='tsv',
    fields=[('text', TEXT), ('labels', LABELS)])

print(train.fields)
print(len(train))
print(vars(train[0]))

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3, sort_key=lambda x: len(x.text), device="cuda:0")

LABELS.build_vocab(train.labels)
TEXT.build_vocab(train.text)

print(TEXT.vocab.freqs.most_common(10))
print(LABELS.vocab.itos)

github Andrew-Tierno / QuantizedTransformer / regular / dataloader.py
def generate_test(data_path="../data_processed"):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = ""
    SRC = data.Field(tokenize=tokenize_vi, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
        eos_token = EOS_WORD, pad_token=BLANK_WORD)
    MAX_LEN = 100
    train, val, test = data.TabularDataset.splits(
        path=data_path, train='train.tsv', test='test2013.tsv',
        validation='dev.tsv', fields=[('src',SRC), ('trg',TGT)], 
        format='tsv', filter_pred=mytestfilter)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

    return (SRC, TGT, train, val, test)

github lrank / Robust_and_Privacy_preserving_Text_Representations / Pytorch / postag / baseline_model_torchtext.py
BIDIRECTIONAL = True
DROUPOUT = 0.5
NUM_EPOCHS = 20


####################################
#          Preparing Data          #
####################################
# 1. data.Field()
TEXT = data.Field(include_lengths=True, pad_token='', unk_token='')
TAG_LABEL = data.LabelField()
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()

# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(path=TrustPilot_processed_dataset_path,
                                                   train="train.csv",
                                                   test="test.csv",
                                                   fields=[('text', TEXT), ('tag_label', TAG_LABEL),
                                                           ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
                                                   format="csv")

# 3. Split train_data to train_data, valid_data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}\n".format(len(test_data)))

# 4. data.BucketIterator
train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                               batch_size=BATCH_SIZE,
                                                               device=device)

github xhuang31 / KEQA_WSDM19 / train_detection.py
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: You have Cuda but not use it. You are using CPU for training.")

# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'), format='tsv', fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None), ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output, validation='valid.txt', test='test.txt', format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")

github johnolafenwa / TorchFusion / torchfusion / lang / datasets / datasets.py
:param val:
    :param test:
    :param skip_header:
    :param save_vocab_path:
    :param args:
    :return:
    """
    if not os.path.exists(save_vocab_path):
        os.mkdir(save_vocab_path)

    dataset_fields = []

    for field in fields:
        dataset_fields.append((field.name,field.field))
    print(dataset_fields)
    dataset = TabularDataset.splits(root_path, ".data", train, val, test,
                                    fields=dataset_fields, skip_header=skip_header,
                                    format=format, **args)

    for f_input in fields:
        name = f_input.name
        field = f_input.field
        vocab = f_input.vocab

        if vocab is None:
            #verify if working properly
            field.build_vocab(*dataset,max_size=f_input.max_size, min_freq=f_input.min_freq,
                 vectors=f_input.vectors, unk_init=f_input.unk_init, vectors_cache=f_input.vectors_cache)

            with open(os.path.join(save_vocab_path,"{}.json".format(name)), "w") as jfile:
                json.dump(field.vocab.stoi,jfile,sort_keys=True)

        else:
            with open(vocab, "r") as jfile:

github songyingxin / TextClassification-Pytorch / Utils / SST2_utils.py
def sst_word_char(path, word_field, char_field, label_field, batch_size, device, word_emb_file, char_emb_file, cache_dir):

    fields = {
        'text': [('text_word', word_field), ('text_char', char_field)],
        'label': ('label', label_field)
    }
    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.jsonl', validation='dev.jsonl',
        test='test.jsonl', format='json', skip_header=True,
        fields=fields)
    
    word_vectors = vocab.Vectors(word_emb_file, cache_dir)
    char_vectors = vocab.Vectors(char_emb_file, cache_dir)

    word_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors=word_vectors, unk_init=torch.Tensor.normal_)
    char_field.build_vocab(
        train, dev, test, max_size=94,
        vectors=char_vectors, unk_init=torch.Tensor.normal_)
    
    label_field.build_vocab(train, dev, test)

github Shivanshu-Gupta / Visual-Question-Answering / preprocess.py
def _create_loaders(path, traintsv, valtsv):
    def parse_int(tok, *args):
        return int(tok)
    quesid = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))
    ques = data.Field(include_lengths=True)
    imgid = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))
    ans = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))
    train_data, val_data = data.TabularDataset.splits(path=path, train=traintsv, validation=valtsv,
                                                      fields=[('quesid', quesid), ('ques', ques), ('imgid', imgid), ('ans', ans)],
                                                      format='tsv')
    batch_sizes = (1, 1)
    train_loader, val_loader = data.BucketIterator.splits((train_data, val_data), batch_sizes=batch_sizes, repeat=False, sort_key=lambda x: len(x.ques))
    ques.build_vocab(train_data)
    print('vocabulary size: {}'.format(len(ques.vocab.stoi)))
    return ques, train_loader, val_loader

github galsang / BiDAF-pytorch / model / data.py
'question': [('q_word', self.WORD), ('q_char', self.CHAR)]}

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL),
                       ('c_word', self.WORD), ('c_char', self.CHAR),
                       ('q_word', self.WORD), ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            train_examples = torch.load(train_examples_path)
            dev_examples = torch.load(dev_examples_path)

            self.train = data.Dataset(examples=train_examples, fields=list_fields)
            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.train, self.dev = data.TabularDataset.splits(
                path=path,
                train=f'{args.train_file}l',
                validation=f'{args.dev_file}l',
                format='json',
                fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.train.examples, train_examples_path)
            torch.save(self.dev.examples, dev_examples_path)

        # cut overly long contexts from the training set, for efficiency
        if args.context_threshold > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]

        print("building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)

github Shivanshu-Gupta / Pytorch-POS-Tagger / main.py
def load_datasets():
    text = data.Field(include_lengths=True)
    tags = data.Field()
    train_data, val_data, test_data = data.TabularDataset.splits(
        path='RNN_Data_files/', train='train_data.tsv', validation='val_data.tsv',
        test='val_data.tsv', fields=[('text', text), ('tags', tags)], format='tsv')

    batch_sizes = (args.batch_size, args.batch_size, args.batch_size)
    train_loader, val_loader, test_loader = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_sizes=batch_sizes,
        sort_key=lambda x: len(x.text))

    text.build_vocab(train_data)
    tags.build_vocab(train_data)
    dataloaders = {'train': train_loader,
                   'validation': val_loader,
                   'test': test_loader}
    return text, tags, dataloaders