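# A minimal sketch of the setup the snippet below assumes but does not show: the
# TEXT/LABEL fields, the `datafields` schema, and a `train_data` Dataset built from a
# pandas DataFrame. The exact field options and file names here are illustrative only.
TEXT = data.Field(sequential=True, tokenize=lambda s: s.split(), lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)
datafields = [("text", TEXT), ("label", LABEL)]
train_df = self.get_pandas_df(train_file)
train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
train_data = data.Dataset(train_examples, datafields)
test_df = self.get_pandas_df(test_file)
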
test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
test_data = data.Dataset(test_examples, datafields)
# If a validation file exists, load it; otherwise split validation data off the training data.
if val_file:
    val_df = self.get_pandas_df(val_file)
    val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
    val_data = data.Dataset(val_examples, datafields)
else:
    train_data, val_data = train_data.split(split_ratio=0.8)
TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
self.word_embeddings = TEXT.vocab.vectors
self.vocab = TEXT.vocab
self.train_iterator = data.BucketIterator(
    train_data,
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.text),
    repeat=False,
    shuffle=True)

self.val_iterator, self.test_iterator = data.BucketIterator.splits(
    (val_data, test_data),
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.text),
    repeat=False,
    shuffle=False)
print ("Loaded {} training examples".format(len(train_data)))
print ("Loaded {} test examples".format(len(test_data)))
print ("Loaded {} validation examples".format(len(val_data)))
def get_msrp_iter(args):
    if not os.path.exists(args.data_dir):
        os.mkdir(args.data_dir)
    TEXT = data.Field(lower=True, tokenize=tokenize_line_en)
    LABELS = data.Field(batch_first=True)
    train, val, test = MSRPDataset.splits(
        fields=(('sentence1', TEXT), ('sentence2', TEXT), ('labels', LABELS)),
        root=args.data_dir)
    TEXT.build_vocab(chain(train.sentence1, train.sentence2))
    LABELS.build_vocab(train.labels)
    print('Number of training examples:', len(train))
    print('Number of validation examples:', len(val))
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_size=args.batch_size,
        device='cuda' if torch.cuda.is_available() else None)
    return train_iter, val_iter, test_iter, TEXT.vocab, LABELS.vocab
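# Illustrative consumption of the iterators returned by get_msrp_iter above; the batch
# attribute names follow the fields declared in that function, everything else is a sketch.
train_iter, val_iter, test_iter, text_vocab, label_vocab = get_msrp_iter(args)
for batch in train_iter:
    s1, s2 = batch.sentence1, batch.sentence2  # [seq_len, batch] LongTensors (batch_first=False)
    y = batch.labels                           # [batch, label_len] (LABELS uses batch_first=True)
    break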
def load_data(text_field, label_field, **kwargs):
    # Drop the neutral class so SST becomes a binary sentiment task.
    train_data, test_data, _ = SST.splits(text_field, label_field,
                                          filter_pred=lambda ex: ex.label != 'neutral')
    text_field.build_vocab(train_data, vectors=GloVe())
    label_field.build_vocab(train_data, test_data)
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_sizes=(args.batch_size, args.batch_size),
        shuffle=args.shuffle,
        **kwargs
    )
    return train_iter, test_iter
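# Hypothetical call site for load_data above. It expects a sequential text field and a
# non-sequential label field, reads batch size/shuffle from a module-level `args`, and
# forwards extra kwargs (here the legacy CPU device flag) to BucketIterator.splits.
text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, test_iter = load_data(text_field, label_field, device=-1, repeat=False)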
en_field = data.Field(  # English source field; assumed symmetric to de_field below
    lower=True, tokenize=tokenize_en, batch_first=True, init_token='', eos_token=''
)
de_field = data.Field(
    lower=True, tokenize=tokenize_de, batch_first=True, init_token='', eos_token=''
)
train_source, val_source, test_source = WMT14Cached.splits(
    root=path,
    exts=('.en', '.de'),
    fields=(en_field, de_field)
)
en_field.build_vocab(train_source.src, min_freq=2)
de_field.build_vocab(train_source.tgt, max_size=17_000)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_source, val_source, test_source),
    batch_size=batch_size,
    repeat=False
)
return SupervisedTextData(
    train_source, val_source, train_iter, val_iter, en_field, de_field
)
os.makedirs(dataset_path)
torch.save(self.train.examples, train_examples_path)
torch.save(self.dev.examples, dev_examples_path)
# Cut overly long contexts from the training set for efficiency.
if args.context_threshold > 0:
    self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]
print("building vocab...")
self.CHAR.build_vocab(self.train, self.dev)
self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim))
print("building iterators...")
device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
self.train_iter, self.dev_iter = \
    data.BucketIterator.splits((self.train, self.dev),
                               batch_sizes=[args.train_batch_size, args.dev_batch_size],
                               device=device,
                               sort_key=lambda x: len(x.c_word))
def _train_epoches(self, data, model, n_epochs, start_epoch, start_step,
                   dev_data=None, teacher_forcing_ratio=0):
    log = self.logger
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch
    # Legacy torchtext convention: device=None selects the current GPU, device=-1 the CPU.
    device = None if torch.cuda.is_available() else -1
    batch_iterator = torchtext.data.BucketIterator(
        dataset=data, batch_size=self.batch_size,
        sort_key=lambda x: -len(x.src),
        device=device, repeat=False)
    steps_per_epoch = len(batch_iterator)
    total_steps = steps_per_epoch * n_epochs
    step = start_step
    step_elapsed = 0
    for epoch in range(start_epoch, n_epochs + 1):
        log.debug("Epoch: %d, Step: %d" % (epoch, step))
        batch_generator = batch_iterator.__iter__()
        # Skip batches already consumed in a previous run so training resumes at `step`.
        for _ in range((epoch - 1) * steps_per_epoch, step):
            next(batch_generator)
opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]
opt.src_vocab_size = len(data['vocab']['src'].vocab)
opt.trg_vocab_size = len(data['vocab']['trg'].vocab)
#========= Preparing Model =========#
if opt.embs_share_weight:
    assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
        'To share word embeddings, the src/trg word2idx tables must be the same.'
fields = {'src': data['vocab']['src'], 'trg':data['vocab']['trg']}
train = Dataset(examples=data['train'], fields=fields)
val = Dataset(examples=data['valid'], fields=fields)
train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
val_iterator = BucketIterator(val, batch_size=batch_size, device=device)
return train_iterator, val_iterator
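# Illustrative consumption of the iterators returned above: because the fields dict uses
# the keys 'src' and 'trg', each batch exposes its tensors under those attribute names.
for batch in train_iterator:
    src_seq, trg_seq = batch.src, batch.trg
    break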
batch_size = 64
inputs = data.Field(lower=True)
answers = data.Field(sequential=False)
train, dev, test = datasets.SNLI.splits(inputs, answers)
inputs.build_vocab(train, dev, test)
vector = os.path.join(USERHOME, '.vector_cache', 'glove.6B.300d.txt.pt')
if os.path.isfile(vector):
# TODO - make it customizable
inputs.vocab.vectors = torch.load(vector)
else:
inputs.vocab.load_vectors('glove.6B.300d')
answers.build_vocab(train)
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=batch_size)
vocab_dim = len(inputs.vocab)
out_dim = len(answers.vocab)
embed_dim = 300
cells = 2
birnn = True
lr = 0.01
epochs = 10
if birnn:
cells *= 2
dropout = 0.5
fc1_dim = 50
fc2_dim = 3
n_layers = 2
network_type = 'LSTM'
train_data, dev_data, test_data = data.TabularDataset.splits(
    path=data_path,
    train='snli_1.0_train.jsonl',
    validation='snli_1.0_dev.jsonl',
    test='snli_1.0_test.jsonl',
    format='json',
    fields=fields,
    filter_pred=lambda ex: ex.label != '-'  # drop unlabeled examples (gold label '-')
)
if vectors is not None:
    TEXT.build_vocab(train_data, vectors=vectors, unk_init=torch.Tensor.normal_)
else:
    TEXT.build_vocab(train_data)
LABEL.build_vocab(dev_data)
train_iter, dev_iter = BucketIterator.splits(
    (train_data, dev_data),
    batch_sizes=(batch_size, batch_size),
    device=device,
    sort_key=lambda x: len(x.premise) + len(x.hypothesis),
    sort_within_batch=True,
    repeat=False,
    shuffle=True
)
test_iter = Iterator(test_data,
                     batch_size=batch_size,
                     device=device,
                     sort=False,
                     sort_within_batch=False,
                     repeat=False,
                     shuffle=False)
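# Note: BucketIterator.splits buckets the train/dev examples by the sort_key above so pairs of
# similar total length share a batch (less padding), while the test split uses a plain
# Iterator with sorting and shuffling disabled, keeping batches in the original example order.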
def get_data_loaders(self, w2v_file, train_file=None, validation_file=None,
                     train_comments=None, train_rating=None,
                     validation_comment=None, validation_ratings=None):
    train_dataset = self.get_dataset(comments=train_comments,
                                     ratings=train_rating,
                                     review_file=train_file)
    validation_dataset = self.get_dataset(comments=validation_comment,
                                          ratings=validation_ratings,
                                          review_file=validation_file)
    datasets = [train_dataset, validation_dataset]
    self._build_vocabs(datasets, w2v_file)
    train_loader = BucketIterator(train_dataset, config.BATCH_SIZE)
    validation_loader = BucketIterator(validation_dataset, config.BATCH_SIZE)
    return train_loader, validation_loader
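# Hypothetical usage of get_data_loaders above; `loader` stands in for an instance of the
# containing class, and the file paths are placeholders.
train_loader, validation_loader = loader.get_data_loaders(
    w2v_file="embeddings/word2vec.txt",
    train_file="data/train_reviews.csv",
    validation_file="data/val_reviews.csv")
for batch in train_loader:
    pass  # each batch carries the tensors defined by get_dataset / _build_vocabs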