Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_provenance(self):
    """Check the provenance entries carried by a freshly built AncestorData.

    The ancestor data must hold exactly one more provenance entry than the
    sample data it was built from. The newest entry must be dated today and
    name "tsinfer" as the software, and every entry of the sample data must
    appear, in order, at the start of the ancestor data's provenances.
    """
    sample_data, ancestors = self.get_example_data(10, 10, 40)
    ancestor_data = tsinfer.AncestorData(sample_data)
    self.verify_data_round_trip(sample_data, ancestor_data, ancestors)
    self.assertEqual(ancestor_data.num_provenances, sample_data.num_provenances + 1)

    # Only the date part (before the "T") is compared, not the time of day.
    newest_timestamp = ancestor_data.provenances_timestamp[-1]
    today = datetime.datetime.now().date().isoformat()
    self.assertEqual(newest_timestamp.split("T")[0], today)

    newest_record = ancestor_data.provenances_record[-1]
    self.assertEqual(newest_record["software"]["name"], "tsinfer")

    # The iterator view must agree with the column-style accessors.
    all_provenances = list(ancestor_data.provenances())
    last_entry = all_provenances[-1]
    self.assertEqual(last_entry[0], newest_timestamp)
    self.assertEqual(last_entry[1], newest_record)

    # Every provenance of the samples must be reproduced at the same index.
    for index, (inherited_timestamp, inherited_record) in enumerate(
        sample_data.provenances()
    ):
        copied = all_provenances[index]
        self.assertEqual(inherited_timestamp, copied[0])
        self.assertEqual(inherited_record, copied[1])
def verify_round_trip(self, genotypes, exclude_sites):
# Helper: adds one site per row of `genotypes` (row j at position j), runs
# tsinfer.infer with the positions selected by `exclude_sites` passed as
# exclude_positions, and checks the "inference_type" stored in the metadata
# of every site of the output.
# NOTE(review): indentation was lost in this excerpt and the definition is
# cut off after the final `if`; the nesting of the last two statements and
# the rest of the body are not visible here.
# `genotypes` and `exclude_sites` must have the same length along axis 0.
self.assertEqual(genotypes.shape[0], exclude_sites.shape[0])
with tsinfer.SampleData() as sample_data:
for j in range(genotypes.shape[0]):
sample_data.add_site(j, genotypes[j])
# Index the stored site positions with `exclude_sites` to get the positions to exclude.
exclude_positions = sample_data.sites_position[:][exclude_sites]
# Run the same checks both with and without simplification.
for simplify in [False, True]:
output_ts = tsinfer.infer(
sample_data, simplify=simplify, exclude_positions=exclude_positions
)
for tree in output_ts.trees():
for site in tree.sites():
# Site metadata is JSON; read the "inference_type" entry.
inf_type = json.loads(site.metadata)["inference_type"]
if exclude_sites[site.id]:
# Sites flagged in exclude_sites are expected to be marked as Fitch parsimony.
self.assertEqual(inf_type, tsinfer.INFERENCE_FITCH_PARSIMONY)
else:
self.assertEqual(inf_type, tsinfer.INFERENCE_FULL)
# Sum of the genotype row belonging to this site.
f = np.sum(genotypes[site.id])
# NOTE(review): the body of this branch is missing from the excerpt.
if f == 0:
def test_zero_sequence_length(self):
    """A sample data file whose sequence_length is 0 must be rejected.

    A sample data file is written to a temporary directory, its
    "sequence_length" attribute is overwritten with 0 directly through
    zarr, and the file is loaded again. The loaded object must report a
    sequence length of 0 and tsinfer.generate_ancestors must raise
    ValueError for it.
    """
    ts = msprime.simulate(10, mutation_rate=2, random_seed=5)
    with tempfile.TemporaryDirectory(prefix="tsinf_format_test") as tempdir:
        filename = os.path.join(tempdir, "samples.tmp")
        with tsinfer.SampleData(path=filename) as sample_data:
            for variant in ts.variants():
                sample_data.add_site(variant.site.position, variant.genotypes)

        # Mangle the sample data file to force a zero sequence length.
        lmdb_store = zarr.LMDBStore(filename, subdir=False)
        root = zarr.open(store=lmdb_store, mode="w+")
        root.attrs["sequence_length"] = 0
        lmdb_store.close()

        reloaded = tsinfer.load(filename)
        self.assertEqual(reloaded.sequence_length, 0)
        with self.assertRaises(ValueError):
            tsinfer.generate_ancestors(reloaded)
def test_match_ancestors_samples(self):
    """match_ancestors must raise ValueError when given unfinalised samples.

    Ancestors are generated from sample data built inside a ``with`` block.
    A second SampleData with the same single site is then created without
    the ``with`` block and passed to tsinfer.match_ancestors.
    """
    with tsinfer.SampleData(sequence_length=2) as finalised:
        finalised.add_site(1, genotypes=[0, 1, 1, 0], alleles=["G", "C"])
    ancestor_data = tsinfer.generate_ancestors(finalised)

    # match_ancestors fails when samples unfinalised
    unfinalised = tsinfer.SampleData(sequence_length=2)
    unfinalised.add_site(1, genotypes=[0, 1, 1, 0], alleles=["G", "C"])
    with self.assertRaises(ValueError):
        tsinfer.match_ancestors(unfinalised, ancestor_data)
def test_infer(self):
    """Run tsinfer.infer on a small simulation and validate the result.

    The simulated tree sequence must have more than one site; the inferred
    tree sequence is handed to ``self.validate_ts``.
    """
    simulated = msprime.simulate(10, mutation_rate=1, random_seed=1)
    self.assertGreater(simulated.num_sites, 1)
    sample_data = tsinfer.SampleData.from_tree_sequence(simulated)
    self.validate_ts(tsinfer.infer(sample_data))
def test_large_random_data(self):
    """Run ``self.verify`` on sample data built from a random example.

    The example is requested with sizes (100, 30); the second size is also
    used as the sequence length of the sample data.
    """
    size_n, size_m = 100, 30
    genotype_rows, positions = get_random_data_example(size_n, size_m)
    with tsinfer.SampleData(sequence_length=size_m) as sample_data:
        # Rows and positions are paired up one-to-one, one site per pair.
        for row, site_position in zip(genotype_rows, positions):
            sample_data.add_site(site_position, row)
    self.verify(sample_data)
def test_one_sites(self):
    """Run ``self.verify`` with only the first default inference position."""
    simulated = msprime.simulate(
        15, mutation_rate=2, recombination_rate=2, random_seed=3
    )
    sample_data = tsinfer.SampleData.from_tree_sequence(simulated)
    inference_positions = get_default_inference_sites(sample_data)
    # A one-element slice, so the helper still receives a sequence.
    first_only = inference_positions[:1]
    self.verify(sample_data, first_only)
def verify(self, sample_data, position_subset):
    """Check that matching samples against subsetted ancestors reproduces genotypes.

    The full inference result is restricted to ``position_subset`` with
    ``self.subset_sites``. Separately, the ancestors tree sequence is
    restricted the same way, minimised and simplified. Sample data derived
    from the restricted full result is then matched against those ancestors,
    and the genotype matrix of the outcome must equal that of the restricted
    full result.

    :param sample_data: Sample data to run the inference on.
    :param position_subset: Positions passed on to ``self.subset_sites``.
    """
    # Reference: full inference, then restrict to the chosen positions.
    full_ts = tsinfer.infer(sample_data)
    expected_ts = self.subset_sites(full_ts, position_subset)

    # Ancestors: restrict first, then minimise and simplify.
    ancestor_data = tsinfer.generate_ancestors(sample_data)
    ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
    restricted_ancestors = self.subset_sites(ancestors_ts, position_subset)
    subset_ancestors_ts = tsinfer.minimise(restricted_ancestors).simplify()

    subset_sample_data = tsinfer.SampleData.from_tree_sequence(expected_ts)
    output_ts = tsinfer.match_samples(subset_sample_data, subset_ancestors_ts)

    genotypes_match = np.array_equal(
        output_ts.genotype_matrix(), expected_ts.genotype_matrix()
    )
    self.assertTrue(genotypes_match)
def test_append_sites_incompatible_files(self):
    """append_sites must raise ValueError for incompatible sample data files.

    An example tree sequence is split into three interval-restricted
    SampleData objects ([0, 2], [2, 5] and [5, 10]). The test then checks
    that ``append_sites`` raises ValueError, with a message matching the
    given pattern, in each of these cases:

    * the target is not a copy (pattern "build");
    * one of the appended objects is a copy (pattern "finalise");
    * the middle piece was built from an ``rtrim()``-ed tree sequence
      (pattern "length");
    * the middle piece was built from a tree sequence simplified to seven
      samples (pattern "samples").

    ``assertRaisesRegex`` is used rather than the ``assertRaisesRegexp``
    alias, which was deprecated in Python 3.2 and removed in Python 3.12.
    """
    ts = get_example_individuals_ts_with_metadata(4, 2, 10)
    sd1 = tsinfer.SampleData.from_tree_sequence(ts.keep_intervals([[0, 2]]))
    mid_ts = ts.keep_intervals([[2, 5]])
    sd2 = tsinfer.SampleData.from_tree_sequence(mid_ts)
    sd3 = tsinfer.SampleData.from_tree_sequence(ts.keep_intervals([[5, 10]]))
    # Fails if altered SD is not in write mode
    self.assertRaisesRegex(ValueError, "build", sd1.append_sites, sd2, sd3)
    # Fails if added SDs are in write mode
    sd = sd1.copy()  # put into write mode
    sd.append_sites(sd2, sd3)  # now works
    self.assertRaisesRegex(
        ValueError, "finalise", sd.append_sites, sd2.copy(), sd3
    )
    sd = sd1.copy()  # put into write mode
    # Wrong seq length
    sd2 = tsinfer.SampleData.from_tree_sequence(mid_ts.rtrim())
    self.assertRaisesRegex(ValueError, "length", sd.append_sites, sd2, sd3)
    # Wrong num samples
    sd2 = tsinfer.SampleData.from_tree_sequence(mid_ts.simplify(list(range(7))))
    self.assertRaisesRegex(ValueError, "samples", sd.append_sites, sd2, sd3)
def test_access_individuals(self):
    """Individuals from iteration and from lookup by id must agree.

    For every individual yielded by ``sd.individuals()`` the id must equal
    its position in the iteration, and ``sd.individual(id)`` must compare
    equal to it until the ``samples`` attribute of the looked-up copy is
    replaced, after which the two must differ. At least one individual must
    carry metadata, and the last index seen must be ``num_individuals - 1``.
    """
    ts = get_example_individuals_ts_with_metadata(5, 2, 10, 1)
    sd = tsinfer.SampleData.from_tree_sequence(ts)
    self.assertGreater(sd.num_individuals, 0)

    metadata_seen = False
    for index, from_iteration in enumerate(sd.individuals()):
        # Check that we do compare something sometimes
        if from_iteration.metadata is not None:
            metadata_seen = True
        self.assertEqual(index, from_iteration.id)

        from_lookup = sd.individual(index)
        self.assertEqual(from_lookup, from_iteration)
        # Changing the samples of one copy must break equality.
        from_lookup.samples = []
        self.assertNotEqual(from_lookup, from_iteration)

    self.assertTrue(metadata_seen)
    # The loop index is reused here to confirm every individual was visited.
    self.assertEqual(index, sd.num_individuals - 1)