Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_zero_sequence_length(self):
# Mangle a sample data file to force a zero sequence length.
ts = msprime.simulate(10, mutation_rate=2, random_seed=5)
with tempfile.TemporaryDirectory(prefix="tsinf_format_test") as tempdir:
filename = os.path.join(tempdir, "samples.tmp")
with tsinfer.SampleData(path=filename) as sample_data:
for var in ts.variants():
sample_data.add_site(var.site.position, var.genotypes)
store = zarr.LMDBStore(filename, subdir=False)
data = zarr.open(store=store, mode="w+")
data.attrs["sequence_length"] = 0
store.close()
sample_data = tsinfer.load(filename)
self.assertEqual(sample_data.sequence_length, 0)
self.assertRaises(ValueError, tsinfer.generate_ancestors, sample_data)
def test_path_compression_parent_child_identical_times(self):
# This provoked a bug in which we created a pc ancestor
# with the same time as its child, creating an invalid topology.
sample_data = tsinfer.load("tests/data/bugs/invalid_pc_ancestor_time.samples")
ts = tsinfer.infer(sample_data)
for var, (_, genotypes) in zip(ts.variants(), sample_data.genotypes()):
self.assertTrue(np.array_equal(var.genotypes, genotypes))
)
self.ancestor_trees = str(
pathlib.Path(self.tempdir.name, "input-data.ancestors.trees")
)
self.output_trees = str(pathlib.Path(self.tempdir.name, "input-data.trees"))
self.input_ts = msprime.simulate(
10, mutation_rate=10, recombination_rate=10, random_seed=10
)
sample_data = tsinfer.SampleData(
sequence_length=self.input_ts.sequence_length, path=self.sample_file
)
for var in self.input_ts.variants():
sample_data.add_site(var.site.position, var.genotypes, var.alleles)
sample_data.finalise()
tsinfer.generate_ancestors(sample_data, path=self.ancestor_file, chunk_size=10)
ancestor_data = tsinfer.load(self.ancestor_file)
ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
ancestors_ts.dump(self.ancestor_trees)
ts = tsinfer.match_samples(sample_data, ancestors_ts)
ts.dump(self.output_trees)
sample_data.close()
def test_defaults_with_path(self):
ts = get_example_ts(10, 10)
with tempfile.TemporaryDirectory(prefix="tsinf_format_test") as tempdir:
filename = os.path.join(tempdir, "samples.tmp")
input_file = formats.SampleData(
path=filename, sequence_length=ts.sequence_length
)
self.verify_data_round_trip(ts, input_file)
compressor = formats.DEFAULT_COMPRESSOR
for _, array in input_file.arrays():
self.assertEqual(array.compressor, compressor)
with tsinfer.load(filename) as other:
self.assertEqual(other, input_file)
def run_combine_ukbb_1kg(args):
ukbb_samples_file = "ukbb_{}.samples".format(args.chromosome)
tg_ancestors_ts_file = "1kg_{}.trees".format(args.chromosome)
ancestors_ts_file = "1kg_ukbb_{}.ancestors.trees".format(args.chromosome)
samples_file = "1kg_ukbb_{}.samples".format(args.chromosome)
ukbb_samples = tsinfer.load(ukbb_samples_file)
tg_ancestors_ts = tskit.load(tg_ancestors_ts_file)
print("Loaded ts:", tg_ancestors_ts.num_nodes, tg_ancestors_ts.num_edges)
# Subset the sites down to the UKBB sites.
tables = tg_ancestors_ts.dump_tables()
ukbb_sites = set(ukbb_samples.sites_position[:])
ancestors_sites = set(tables.sites.position[:])
intersecting_sites = ancestors_sites & ukbb_sites
print("Intersecting sites = ", len(intersecting_sites))
tables.sites.clear()
tables.mutations.clear()
for site in tg_ancestors_ts.sites():
if site.position in intersecting_sites:
# Sites must be 0/1 for the ancestors ts.
site_id = tables.sites.add_row(
parser.add_argument(
"-n", "--use_first_n_samples", type=int,
help="Only use the first n (diploid) samples from UK-biobank")
parser.add_argument(
"-p", "--progress", action="store_true",
help="Show progress bars and output extra information when done")
args = parser.parse_args()
if not os.path.exists(args.bgen_file):
raise ValueError("{} does not exist".format(args.bgen_file))
if not os.path.exists(args.sampledata_file):
raise ValueError("{} does not exist".format(args.sampledata_file))
bgen = bgen_reader.read_bgen(args.bgen_file, verbose=False, size = 500)
sample_data = tsinfer.load(args.sampledata_file)
create_datafile(sample_data, bgen, args.outfile, show_progress=args.progress)
help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
parser.add_argument(
"-V", "--version", action='version', version=description)
args = parser.parse_args()
if args.recombination_rate is not None:
logging.warning("TSinfer now simply ignores recombination rate. You can omit this parameter")
if args.error_probability is not None:
logging.warning("TSinfer now simply ignores error probabilities. You can omit this parameter")
engine = tsinfer.PY_ENGINE if args.method == "P" else tsinfer.C_ENGINE
if not os.path.isfile(args.samples):
raise ValueError("No samples file")
sample_data = tsinfer.load(args.samples)
if all(False for _ in sample_data.genotypes(inference_sites=True)):
raise ValueError("No inference sites")
if args.inject_real_ancestors_from_ts is not None:
ancestor_data = tsinfer.AncestorData.initialise(sample_data, compressor=None)
orig_ts = msprime.load(args.inject_real_ancestors_from_ts)
eval_util.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
ancestor_data.finalise()
ancestors_ts = tsinfer.match_ancestors(
sample_data, ancestor_data, path_compression=args.shared_recombinations, engine=engine)
ts = tsinfer.match_samples(
sample_data, ancestors_ts, path_compression=args.shared_recombinations, engine=engine, simplify=True)
else:
ts = tsinfer.infer(
sample_data, num_threads=args.threads, path_compression=args.shared_recombinations, engine=engine)
ts.dump(args.output)