How to use the torchtext.data module in torchtext

To help you get started, we’ve selected a few examples of torchtext.data usage, drawn from popular public projects.

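Before the project-specific excerpts below, here is a short self-contained sketch of the core torchtext.data workflow they all build on. It assumes the legacy API (torchtext < 0.9, or torchtext.legacy.data in newer releases), and the TSV filename and column layout are hypothetical: define Fields, load a TabularDataset, build vocabularies, and batch with a BucketIterator.

import torch
from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

# Hypothetical two-column TSV file: <label>\t<text>
TEXT = data.Field(tokenize=str.split, lower=True, batch_first=True)
LABEL = data.LabelField()

dataset = data.TabularDataset(
    path='reviews.tsv',  # hypothetical path
    format='tsv',
    fields=[('label', LABEL), ('text', TEXT)],
)
train_set, valid_set = dataset.split(split_ratio=0.9)

TEXT.build_vocab(train_set, max_size=25000, min_freq=2)
LABEL.build_vocab(train_set)

train_iter, valid_iter = data.BucketIterator.splits(
    (train_set, valid_set),
    batch_size=32,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=True,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

for batch in train_iter:
    text, labels = batch.text, batch.label  # text: (batch, seq_len) LongTensor
    break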

github nhatsmrt / nn-toolbox / test_lm.py
from nntoolbox.utils import get_device
from nntoolbox.sequence.models import LanguageModel
from nntoolbox.sequence.learner import LanguageModelLearner
from nntoolbox.sequence.components import AdditiveContextEmbedding
from nntoolbox.sequence.utils import load_embedding
from torch import nn
from torch.optim import Adam
import torch
from torchtext import data
from torchtext.datasets import WikiText2
from nntoolbox.callbacks import *
from nntoolbox.metrics import *


MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
#     print(tmp)


train_data, val_data, test_data = WikiText2.splits(TEXT)
train_iterator = data.BPTTIterator(
    train_data,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device(),
    bptt_len=35,
    shuffle=True
)
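The excerpt above stops before the vocabulary is built; in the legacy torchtext API a Field must call build_vocab before its iterator can numericalize text. A minimal sketch of the usual continuation, reusing the names defined above (the actual training loop is omitted):

# Build the vocabulary before iterating (required for numericalization).
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

for batch in train_iterator:
    inputs = batch.text     # LongTensor of shape (bptt_len, batch_size)
    targets = batch.target  # the same tokens shifted one step ahead
    # feed `inputs` to the language model and score its predictions against `targets`
    break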
github castorini / BuboQA / ferhan_simple_qa_rnn / relation_prediction / predict_one_instance.py
print("WARNING: You have CUDA but not using it.")
if torch.cuda.is_available() and args.cuda:
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)

if not args.trained_model:
    print("ERROR: You need to provide a option 'trained_model' path to load the model.")
    sys.exit(1)

# ---- get the Field, Dataset, Iterator for train/dev/test sets -----
tokenizer = TreebankWordTokenizer()
def tokenize_text():
    return lambda text: tokenizer.tokenize(text)

questions = data.Field(lower=True, tokenize=tokenize_text())
relations = data.Field(sequential=False)

train, dev, test = SimpleQaRelationDataset.splits(questions, relations)
train_iter, dev_iter, test_iter = SimpleQaRelationDataset.iters(args, questions, relations, train, dev, test, shuffleTrain=False)

# load the model

config = args
config.n_embed = len(questions.vocab) # vocab. size / number of embeddings
config.d_out = len(relations.vocab)
config.n_cells = config.n_layers
# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
print(config)

model = RelationClassifier(config)
github JayParks / transformer / data / data_utils.py
                          sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
                          repeat=False, shuffle=True, device=device)
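The fragment above shows only the trailing keyword arguments of a BucketIterator call. For context, here is a small self-contained sketch (legacy torchtext API; the toy sentence pairs are invented) of bucketing parallel src/trg examples with data.interleave_keys as the sort key, which keeps similarly sized pairs in the same batch so per-batch padding stays small:

from torchtext import data

SRC = data.Field(tokenize=str.split)
TRG = data.Field(tokenize=str.split, init_token='<sos>', eos_token='<eos>')
fields = [('src', SRC), ('trg', TRG)]
examples = [data.Example.fromlist([s, t], fields)
            for s, t in [('a tiny test', 'ein kleiner test'),
                         ('hello world', 'hallo welt')]]
train_dataset = data.Dataset(examples, fields)
SRC.build_vocab(train_dataset)
TRG.build_vocab(train_dataset)

# interleave_keys sorts by source and target length jointly.
train_iter = data.BucketIterator(
    train_dataset, batch_size=2,
    sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
    repeat=False, shuffle=True, device='cpu')

for batch in train_iter:
    print(batch.src.shape, batch.trg.shape)  # (src_len, 2) and (trg_len, 2)
    break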
github kh-kim / simple-ntc / simple_ntc / data_loader.py
):
        '''
        DataLoader initialization.
        :param train_fn: Train-set filename
        :param batch_size: Batchify data for a certain batch size.
        :param device: Device-id to load data (-1 for CPU)
        :param max_vocab: Maximum vocabulary size
        :param min_freq: Minimum frequency for loaded word.
        :param use_eos: If it is True, put <EOS> after every end of sentence.
        :param shuffle: If it is True, randomly shuffle the input data.
        '''
        super().__init__()

        # Define field of the input file.
        # The input file consists of two fields.
        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None
        )

        # Those defined two columns will be delimited by TAB.
        # Thus, we use TabularDataset to load two columns in the input file.
        # We would have two separate input file: train_fn, valid_fn
        # Files consist of two columns: label field and text field.
        train, valid = data.TabularDataset.splits(
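The excerpt is cut off mid-call. A plausible sketch of how the method continues (hypothetical; train_fn and valid_fn are the TSV filenames passed to the DataLoader): finish the TabularDataset.splits call over the two tab-delimited columns, then build both vocabularies.

# Hypothetical continuation: load both TSV files and build the vocabularies.
train, valid = data.TabularDataset.splits(
    path='',
    train=train_fn,
    validation=valid_fn,
    format='tsv',
    fields=[('label', self.label), ('text', self.text)],
)
self.label.build_vocab(train)
self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)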
github omerktz / TraFix / open_nmt / onmt / inputters / dataset_base.py
def _construct_example_fromlist(self, data, fields):
        """
        Args:
            data: the values to be set as attributes of the to-be-created
                `Example`, paired positionally with the entries of `fields`.
            fields: an iterable of (name, `torchtext.data.Field`) pairs; each
                name becomes an attribute of the to-be-created `Example`.

        Returns:
            the created `Example` object.
        """
        ex = torchtext.data.Example()
        for (name, field), val in zip(fields, data):
            if field is not None:
                setattr(ex, name, field.preprocess(val))
            else:
                setattr(ex, name, val)
        return ex
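For reference, a minimal stand-alone sketch of the same pattern (the field list and raw values below are invented purely for illustration): Field.preprocess runs the field's tokenization and preprocessing pipeline, while entries whose field is None are stored on the Example unchanged.

import torchtext

# Hypothetical (name, field) pairs and parallel raw values.
fields = [('src', torchtext.data.Field(tokenize=str.split)), ('index', None)]
values = ['a small example', 7]

ex = torchtext.data.Example()
for (name, field), val in zip(fields, values):
    setattr(ex, name, field.preprocess(val) if field is not None else val)

print(ex.src, ex.index)  # ['a', 'small', 'example'] 7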
github MultiPath / MetaNMT / meta_nmt5.py
def dyn_batch_without_padding(new, i, sofar):
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))


if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding # dyn_batch_without_padding

train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data), batch_sizes=(args.batch_size, args.valid_batch_size), device=args.gpu, shuffle=False,
    batch_size_fn=batch_size_fn, repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset, batch_size=args.batch_size, device=args.gpu, train=True, batch_size_fn=batch_size_fn, shuffle=False)
            for dataset in aux_data]
logger.info("build the dataset. done!")


# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
            'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000} # ~32
args.__dict__.update(hparams)

# ----------------------------------------------------------------------------------------------------------------- #
# show the arg:

# hp_str = (f"{args.dataset}_subword_"
#           f"{args.d_model}_{args.d_hidden}_{args.n_layers}_{args.n_heads}_"
github rrkarim / unbounded-cache-lm / main.py
opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint
    )
    checkpoint = Checkpoint.load(checkpoint_path)
    seq2seq = checkpoint.model
    input_vocab = checkpoint.input_vocab
    output_vocab = checkpoint.output_vocab
else:
    # Prepare dataset
    src = SourceField()
    tgt = TargetField()
    max_len = 50

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(
        path=opt.train_path,
        format="tsv",
        fields=[("src", src), ("tgt", tgt)],
        filter_pred=len_filter,
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path,
        format="tsv",
        fields=[("src", src), ("tgt", tgt)],
        filter_pred=len_filter,
    )
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
github asyml / forte / forte / trainer / ner_trainer.py
logger.info(f"Average sentence length: {(lengths / counter):0.3f}")

        train_err = 0.0
        train_total = 0.0

        start_time = time.time()
        self.model.train()

        # Each time we will clear and reload the train_instances_cache
        instances = self.train_instances_cache
        random.shuffle(self.train_instances_cache)
        data_iterator = torchtext.data.iterator.pool(
            instances, self.config_data.batch_size_tokens,
            key=lambda x: x.length(),  # length of word_ids
            batch_size_fn=batch_size_fn,
            random_shuffler=torchtext.data.iterator.RandomShuffler())

        step = 0

        for batch in data_iterator:
            step += 1
            batch_data = self.get_batch_tensor(batch, device=self.device)
            word, char, labels, masks, lengths = batch_data

            self.optim.zero_grad()
            loss = self.model(word, char, labels, mask=masks)
            loss.backward()
            self.optim.step()

            num_inst = word.size(0)
            train_err += loss.item() * num_inst
            train_total += num_inst
github henryhungle / MTN / data_utils.py
return tgt_mask

global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch,  len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
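To make the bookkeeping concrete, here is a tiny stand-alone illustration of the batch_size_fn defined above (the _Ex stand-in and the token counts are invented): the iterator calls it cumulatively while filling a batch and closes the batch once the returned token count would exceed the configured batch_size budget.

class _Ex:
    """Stand-in for a torchtext Example with .src / .trg token lists."""
    def __init__(self, src, trg):
        self.src, self.trg = src, trg

size = 0
for count, ex in enumerate([_Ex(['w'] * 10, ['w'] * 12),
                            _Ex(['w'] * 7,  ['w'] * 5)], start=1):
    size = batch_size_fn(ex, count, size)

print(size)  # 28 = 2 examples * max(10, 12 + 2) padded target tokens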

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
            
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))
github tatsuokun / context2vec / src / util / batch.py
device: int,
                 pad_token='<PAD>',
                 unk_token='<UNK>',
                 bos_token='<BOS>',
                 eos_token='<EOS>',
                 seed=777):

        numpy.random.seed(seed)
        self.sent_dict = self._gathered_by_lengths(sentences)
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.device = device

        self.sentence_field = data.Field(use_vocab=True,
                                         unk_token=self.unk_token,
                                         pad_token=self.pad_token,
                                         init_token=self.bos_token,
                                         eos_token=self.eos_token,
                                         batch_first=True,
                                         include_lengths=False)
        self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)

        self.sentence_field.build_vocab(sentences, min_freq=min_freq)
        self.vocab = self.sentence_field.vocab
        if self.pad_token:
            self.pad_index = self.sentence_field.vocab.stoi[self.pad_token]

        self.dataset = self._create_dataset(self.sent_dict, sentences)