# Reload path: re-read and re-tokenize the dataset with the saved
# vocabularies, then extend the embedding matrices for any symbols that
# were added during re-tokenization.
with open(args.data) as f_o:
    data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)
data = tokenize_data(data, token_to_id, char_to_id)
id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
id_to_char = {id_: char for char, id_ in char_to_id.items()}
data = get_loader(data, args)

if len_tok_voc != len(token_to_id):
    # Tokens with ids past the old vocabulary size need new embedding rows.
    need = set(tok for id_, tok in id_to_token.items()
               if id_ >= len_tok_voc)

    if args.word_rep:
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(f_o, need)
    else:
        pre_trained = SymbolEmbSourceText([], need)

    # Fit a Gaussian to the current rows; OOV rows are sampled from it.
    cur = model.embedder.embeddings[0].embeddings.weight.data.numpy()
    mean = cur.mean(0)
    if args.use_covariance:
        cov = np.cov(cur, rowvar=False)
    else:
        cov = cur.std(0)

    rng = np.random.RandomState(2)
    oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

    if args.word_rep:
        print('Augmenting with pre-trained embeddings...')
    else:
        print('Augmenting with random embeddings...')

    model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
        symbol_injection(
            id_to_token, len_tok_voc,
            model.embedder.embeddings[0].embeddings.weight.data.numpy(),
            pre_trained, oovs))

if len_char_voc != len(char_to_id):
    # New characters have no pre-trained source, so they always get
    # random embeddings drawn from the same kind of Gaussian.
    print('Augmenting with random char embeddings...')
    pre_trained = SymbolEmbSourceText([], None)

    cur = model.embedder.embeddings[1].embeddings.weight.data.numpy()
    mean = cur.mean(0)
    if args.use_covariance:
        cov = np.cov(cur, rowvar=False)
    else:
        cov = cur.std(0)

    rng = np.random.RandomState(2)
    oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

    model.embedder.embeddings[1].embeddings.weight.data = torch.from_numpy(
        symbol_injection(
            id_to_char, len_char_voc,
            model.embedder.embeddings[1].embeddings.weight.data.numpy(),
            pre_trained, oovs))
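# A minimal, self-contained sketch of the OOV strategy above: fit a Gaussian
# to the existing embedding rows and sample new rows from it. This is plain
# NumPy, independent of SymbolEmbSourceNorm; `emb` and `n_new` are
# illustrative names that do not appear in the code above.
import numpy as np

emb = np.random.randn(100, 16).astype('float32')  # stand-in for current rows
n_new = 5                                         # rows to append
use_covariance = True                             # mirrors args.use_covariance
rng = np.random.RandomState(2)

mean = emb.mean(0)
if use_covariance:
    cov = np.cov(emb, rowvar=False)
    new_rows = rng.multivariate_normal(mean, cov, size=n_new)
else:
    std = emb.std(0)
    new_rows = mean + std * rng.standard_normal((n_new, emb.shape[1]))
emb = np.concatenate([emb, new_rows.astype(emb.dtype)], axis=0)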
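# The snippets above and below assume an argparse-style `args` object. Only
# the attribute names (data, word_rep, use_covariance) are taken from the
# code; the flag spellings and help strings here are illustrative guesses.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('data',
                    help='JSON dataset passed to load_data')
parser.add_argument('--word_rep', default=None,
                    help='optional text file of pre-trained word vectors')
parser.add_argument('--use_covariance', action='store_true',
                    help='sample OOV rows with full covariance instead of '
                         'per-dimension std')
args = parser.parse_args()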
# Fresh-initialization path: build the vocabularies while tokenizing, create
# the model, and seed the token embeddings from pre-trained vectors if given.
with open(args.data) as f_o:
    data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)
print('Tokenizing data...')
data = tokenize_data(data, token_to_id, char_to_id)
data = get_loader(data, config)

id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
id_to_char = {id_: char for char, id_ in char_to_id.items()}

print('Creating model...')
model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

if args.word_rep:
    print('Loading pre-trained embeddings...')
    with open(args.word_rep) as f_o:
        pre_trained = SymbolEmbSourceText(
            f_o,
            set(tok for id_, tok in id_to_token.items() if id_ != 0))
    mean, cov = pre_trained.get_norm_stats(args.use_covariance)
    rng = np.random.RandomState(2)
    oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

    # Inject from row 0: every token is either pre-trained or sampled.
    model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
        symbol_injection(
            id_to_token, 0,
            model.embedder.embeddings[0].embeddings.weight.data.numpy(),
            pre_trained, oovs))
else:
    pass  # No pre-training, just keep the random values.

# Char embeddings are already random, so we don't need to update them.
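# For reference, a minimal sketch of the calling convention used for
# symbol_injection above: rows with id >= start are filled from the
# pre-trained source when it knows the symbol, otherwise sampled from the
# OOV source. This illustrates the idea only; it is not the library's
# actual implementation, and `get_rep` is a hypothetical lookup method.
import numpy as np

def symbol_injection_sketch(id_to_symbol, start, embedding,
                            pre_trained, oovs):
    """Return `embedding` grown to len(id_to_symbol) rows."""
    dim = embedding.shape[1]
    out = np.empty((len(id_to_symbol), dim), dtype=embedding.dtype)
    out[:start] = embedding[:start]              # keep already-trained rows
    for id_ in range(start, len(id_to_symbol)):
        symbol = id_to_symbol[id_]
        rep = pre_trained.get_rep(symbol, dim)   # hypothetical lookup
        if rep is None:
            rep = oovs.get_rep(symbol, dim)      # hypothetical fallback
        out[id_] = rep
    return out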