def start_sampler():
    g = generate_rand_graph(100)
    namebook = {0: '127.0.0.1:50051'}
    sender = dgl.contrib.sampling.SamplerSender(namebook)
    for i, subg in enumerate(dgl.contrib.sampling.NeighborSampler(
            g, 1, 100, neighbor_type='in', num_workers=4)):
        sender.send(subg, 0)
    sender.signal(0)
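# A minimal sketch of the matching receiver side (an assumption that mirrors the
# SamplerReceiver usage shown further below): the trainer process rebuilds each
# subgraph sent by the remote sampler and simply iterates over them.
def start_receiver():
    g = generate_rand_graph(100)
    receiver = dgl.contrib.sampling.SamplerReceiver(graph=g, addr='127.0.0.1:50051', num_sender=1)
    for subg in receiver:
        # each item is a sampled subgraph (NodeFlow) reconstructed on this side
        pass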
def test_random_walk_with_restart():
    edge_list = [(0, 1), (1, 2), (2, 3), (3, 4),
                 (4, 3), (3, 2), (2, 1), (1, 0)]
    seeds = [0, 1]
    max_nodes = 10

    g = dgl.DGLGraph(edge_list)

    # test normal RWR
    traces = dgl.contrib.sampling.random_walk_with_restart(g, seeds, 0.2, max_nodes)
    assert len(traces) == len(seeds)
    for traces_per_seed in traces:
        total_nodes = 0
        for t in traces_per_seed:
            total_nodes += len(t)
            trace_diff = np.diff(F.zerocopy_to_numpy(t), axis=-1)
            assert (np.abs(trace_diff) == 1).all()
        assert total_nodes >= max_nodes

    # test RWR with early stopping
    traces = dgl.contrib.sampling.random_walk_with_restart(
        g, seeds, 1, 100, max_nodes, 1)
    assert len(traces) == len(seeds)
    for traces_per_seed in traces:
        assert sum(len(t) for t in traces_per_seed) < 100
def test_metapath_random_walk():
    g1 = dgl.bipartite(([0, 1, 2, 3], [0, 1, 2, 3]), 'a', 'ab', 'b')
    g2 = dgl.bipartite(([0, 0, 1, 1, 2, 2, 3, 3], [1, 3, 2, 0, 3, 1, 0, 2]), 'b', 'ba', 'a')
    G = dgl.hetero_from_relations([g1, g2])
    seeds = [0, 1]
    traces = dgl.contrib.sampling.metapath_random_walk(G, ['ab', 'ba'] * 4, seeds, 3)
    for seed, traces_per_seed in zip(seeds, traces):
        assert len(traces_per_seed) == 3
        for trace in traces_per_seed:
            assert len(trace) == 8
            trace = np.insert(F.asnumpy(trace), 0, seed)
            for i in range(4):
                assert g1.has_edge_between(trace[2 * i], trace[2 * i + 1])
                assert g2.has_edge_between(trace[2 * i + 1], trace[2 * i + 2])
def train_on_subgraphs(g, label_nodes, batch_size,
                       steady_state_operator, predictor, trainer):
    # To train SSE, we create two subgraph samplers with the
    # `NeighborSampler` API for each phase.

    # The first phase samples from all vertices in the graph.
    sampler = dgl.contrib.sampling.NeighborSampler(
        g, batch_size, g.number_of_nodes(), num_hops=1)
    sampler_iter = iter(sampler)

    # The second phase only samples from labeled vertices.
    sampler_train = dgl.contrib.sampling.NeighborSampler(
        g, batch_size, g.number_of_nodes(), seed_nodes=label_nodes, num_hops=1)
    sampler_train_iter = iter(sampler_train)

    for i in range(n_embedding_updates):
        subg = next(sampler_iter)
        # Currently, subgraphing does not copy or share features
        # automatically. Therefore, we need to copy the node
        # embeddings of the subgraph from the parent graph with
        # `copy_from_parent()` before computing...
        subg.copy_from_parent()
        update_embeddings_subgraph(subg, steady_state_operator)
        # ... and copy them back to the parent graph.
        g.ndata['h'][subg.layer_parent_nid(-1)] = subg.layers[-1].data['h']
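        # Note: the sampled NodeFlow also offers copy_to_parent() (used in the
        # later snippets) for writing features back; the direct ndata assignment
        # above does the same thing for the single field 'h'.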
    for i in range(n_parameter_updates):
        try:
            subg = next(sampler_train_iter)
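        # The snippet is cut off here; a plausible continuation (an assumption,
        # not the original code) restarts the labeled-node sampler once its
        # iterator is exhausted:
        except StopIteration:
            sampler_train_iter = iter(sampler_train)
            subg = next(sampler_train_iter)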
# This fragment starts in the middle of the call that constructs the inference
# model; the call head below (`GCNInfer(in_feats, n_hidden, ...)`) is an assumed
# reconstruction, not the original code.
infer_model = GCNInfer(in_feats,
                       n_hidden,
                       n_classes,
                       n_layers,
                       mx.nd.relu,
                       prefix='GCN')
infer_model.initialize(ctx=ctx)

# Create the optimizer; use a distributed kvstore when training with multiple workers.
print(model.collect_params())
kv_type = 'dist_sync' if distributed else 'local'
trainer = gluon.Trainer(model.collect_params(), 'adam',
                        {'learning_rate': args.lr, 'wd': args.weight_decay},
                        kvstore=mx.kv.create(kv_type))
# Create sampler receiver
sampler = dgl.contrib.sampling.SamplerReceiver(graph=g, addr=args.ip, num_sender=args.num_sampler)

# initialize graph
dur = []
adj = g.adjacency_matrix().as_in_context(g_ctx)

for epoch in range(args.n_epochs):
    start = time.time()
    if distributed:
        msg_head = "Worker {:d}, epoch {:d}".format(g.worker_id, epoch)
    else:
        msg_head = "epoch {:d}".format(epoch)
    for nf in sampler:
        for i in range(n_layers):
            agg_history_str = 'agg_h_{}'.format(i)
            dests = nf.layer_parent_nid(i + 1).as_in_context(g_ctx)
            # TODO we could use DGLGraph.pull to implement this, but the current
            # implementation of pull is very slow. Let's manually do it for now.
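            # A sketch of the "manual" aggregation the TODO above refers to (an
            # assumption, not the original code; it presumes `adj` is a dense
            # NDArray and that layer-i embeddings live in g.ndata['h_i']): gather
            # the adjacency rows of the destination nodes and aggregate their
            # in-neighbors' embeddings into the history field.
            agg = mx.nd.dot(mx.nd.take(adj, dests), g.ndata['h_{}'.format(i)])
            g.ndata[agg_history_str][dests] = agg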
elif i % args.gpu == 0:
    trainer.step(len(seeds) * len(losses))
    for loss in losses:
        train_loss += loss.asnumpy()[0]
    losses = []

if i % args.num_parallel_subgraphs == 0:
    end1 = time.time()
    print("process " + str(args.num_parallel_subgraphs)
          + " subgraphs takes " + str(end1 - start1))
    start1 = end1

if args.cache_subgraph:
    sampler.restart()
else:
    sampler = dgl.contrib.sampling.NeighborSampler(g, args.batch_size, neigh_expand,
                                                   neighbor_type='in',
                                                   num_workers=args.num_parallel_subgraphs,
                                                   seed_nodes=train_vs, shuffle=True)
# test set accuracy
logits = model_infer(g, eval_vs)
y_bar = mx.nd.argmax(logits, axis=1)
y = eval_labels
accuracy = mx.nd.sum(y_bar == y) / len(y)
accuracy = accuracy.asnumpy()[0]

# update the inference model.
infer_params = model_infer.collect_params()
for key in infer_params:
    idx = trainer._param2idx[key]
    trainer._kvstore.pull(idx, out=infer_params[key].data())
nf.copy_to_parent(node_embed_names=node_embed_names)
mx.nd.waitall()
print(msg_head + ': training takes ' + str(time.time() - start))

infer_params = infer_model.collect_params()
for key in infer_params:
    idx = trainer._param2idx[key]
    trainer._kvstore.pull(idx, out=infer_params[key].data())

num_acc = 0.
num_tests = 0
if not distributed or g.worker_id == 0:
    for nf in dgl.contrib.sampling.NeighborSampler(g, args.test_batch_size,
                                                    g.number_of_nodes(),
                                                    neighbor_type='in',
                                                    num_hops=n_layers,
                                                    seed_nodes=test_nid,
                                                    add_self_loop=True):
        node_embed_names = [['preprocess', 'features']]
        for i in range(n_layers):
            node_embed_names.append(['norm', 'subg_norm'])
        nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx)
        pred = infer_model(nf)
        batch_nids = nf.layer_parent_nid(-1)
        batch_labels = g.nodes[batch_nids].data['labels'].as_in_context(ctx)
        num_acc += (pred.argmax(axis=1) == batch_labels).sum().asscalar()
        num_tests += nf.layer_size(-1)
if distributed:
    nf.copy_to_parent(node_embed_names=node_embed_names)
mx.nd.waitall()
print(msg_head + ': training takes ' + str(time.time() - start))

infer_params = infer_model.collect_params()
for key in infer_params:
    idx = trainer._param2idx[key]
    trainer._kvstore.pull(idx, out=infer_params[key].data())

num_acc = 0.
num_tests = 0
if not distributed or g.worker_id == 0:
    for nf in dgl.contrib.sampling.NeighborSampler(g, args.test_batch_size,
                                                    g.number_of_nodes(),
                                                    neighbor_type='in',
                                                    num_hops=n_layers,
                                                    seed_nodes=test_nid):
        node_embed_names = [['preprocess']]
        for i in range(n_layers):
            node_embed_names.append(['norm'])
        nf.copy_from_parent(node_embed_names=node_embed_names, ctx=ctx)
        pred = infer_model(nf)
        batch_nids = nf.layer_parent_nid(-1)
        batch_labels = g.nodes[batch_nids].data['labels'].as_in_context(ctx)
        num_acc += (pred.argmax(axis=1) == batch_labels).sum().asscalar()
        num_tests += nf.layer_size(-1)

if distributed:
    g._sync_barrier()
optimizer.zero_grad()
loss.backward()
optimizer.step()

node_embed_names = [['h_{}'.format(i)] for i in range(n_layers)]
node_embed_names.append([])
nf.copy_to_parent(node_embed_names=node_embed_names)

for infer_param, param in zip(infer_model.parameters(), model.parameters()):
    infer_param.data.copy_(param.data)
num_acc = 0.
for nf in dgl.contrib.sampling.NeighborSampler(g, args.test_batch_size,
                                                g.number_of_nodes(),
                                                neighbor_type='in',
                                                num_workers=32,
                                                num_hops=n_layers,
                                                seed_nodes=test_nid):
    node_embed_names = [['preprocess']]
    for i in range(n_layers):
        node_embed_names.append(['norm'])
    nf.copy_from_parent(node_embed_names=node_embed_names)

    infer_model.eval()
    with torch.no_grad():
        pred = infer_model(nf)
        batch_nids = nf.layer_parent_nid(-1).to(device=pred.device).long()
        batch_labels = labels[batch_nids]
        num_acc += (pred.argmax(dim=1) == batch_labels).sum().cpu().item()