parser.add_argument(
"-e", "--error-probability", default=0, type=float,
help="The probability of observing an error")
parser.add_argument(
"-t", "--threads", default=1, type=int,
help="The number of worker threads to use")
parser.add_argument(
"-m", "--method", default="C", choices=['C','P'],
help="Which implementation to use, [C] (faster) or [P]ython (more debuggable)")
parser.add_argument(
"--inject-real-ancestors-from-ts", default=None,
help="Instead of inferring ancestors, construct known ones from this tree sequence file path")
args = parser.parse_args()
try:
sample_data = tsinfer.SampleData.load(args.samples)
except dbm.error:
raise ValueError("No samples file")
if args.inject_real_ancestors_from_ts is not None:
orig_ts = msprime.load(args.inject_real_ancestors_from_ts)
ancestor_data = formats.AncestorData.initialise(sample_data, compressor=None)
evaluation.build_simulated_ancestors(sample_data, ancestor_data, orig_ts)
ancestor_data.finalise()
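    # Thread the known ancestors against each other, then match the sample
    # haplotypes onto the resulting ancestors tree sequence.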
ancestors_ts = tsinfer.match_ancestors(
sample_data, ancestor_data, method=args.method,
path_compression=args.shared_recombinations)
ts = tsinfer.match_samples(
sample_data, ancestors_ts, method=args.method,
path_compression=args.shared_recombinations,
simplify=True)
else:
ancestor_data = formats.AncestorData.initialise(sample_data, compressor=None)
def run_infer(args):
setup_logging(args)
progress_monitor = ProgressMonitor(
enabled=args.progress,
generate_ancestors=True,
match_ancestors=True,
match_samples=True,
)
sample_data = tsinfer.SampleData.load(args.samples)
ts = tsinfer.infer(
sample_data, progress_monitor=progress_monitor, num_threads=args.num_threads
)
output_trees = get_output_trees_path(args.output_trees, args.samples)
logger.info("Writing output tree sequence to {}".format(output_trees))
ts.dump(output_trees)
summarise_usage()
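# A minimal sketch of the same inference step without the CLI plumbing; the
# tsinfer calls are the real API, but the file names here are hypothetical.
import tsinfer

sample_data = tsinfer.SampleData.load("data.samples")  # hypothetical input path
ts = tsinfer.infer(sample_data)
ts.dump("data.trees")  # hypothetical output path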
if self.tool == ARGWEAVER:  # ARGWEAVER constant assumed, mirroring RENTPLUS below
    ret = self.__run_ARGweaver(skip_infer=metrics_only)
elif self.tool == RENTPLUS:
    ret = self.__run_RentPlus(skip_infer=metrics_only)
else:
raise ValueError("unknown tool {}".format(self.tool))
if self.inferred_filenames is None:
logging.info("No inferred tree files so metrics skipped for {} row {} = {}".format(
self.tool, int(self.row[0]), ret))
else:
for metric in self.metric_params:
        # NB Jerome thinks it may be clearer to have get_metrics() return a single set
        # of metrics rather than an average over multiple inferred nexus files, and to
        # do the averaging in Python
if metric & METRICS_LOCATION_VARIANTS:
#get positions from the samples store, for use in metric calcs
try:
                positions = tsinfer.SampleData.load(
                    path=self.cmp_fn + ".samples").sites_position[:].tolist()
except FileNotFoundError:
#no such file exists, could be a case with no sites
continue
else:
positions = None
source_nexus_file = self.cmp_fn + ".nex"
inferred_nexus_files = [fn + ".nex" for fn in self.inferred_filenames]
        # Here we should create a separate set of metrics for tsinfer with and without
        # polytomy breaking: check whether the tool is TSINFER and prepend a
        # distinguishing prefix to the default and broken-polytomy metric names
if metric & METRICS_POLYTOMIES_BREAK:
metrics = []
for i in range(self.polytomy_reps):
                metrics.append(ARG_metrics.get_metrics(
                    source_nexus_file, inferred_nexus_files, variant_positions=positions,
                    randomly_resolve_inferred=int(self.row.seed) + i * 11))
metrics = pd.DataFrame(metrics).mean().to_dict()
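            # metrics is now a single dict: the mean of each metric across the
            # polytomy-resolution replicates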
def run_match_ancestors(args):
setup_logging(args)
ancestors_path = get_ancestors_path(args.ancestors, args.samples)
logger.info("Loading ancestral haplotypes from {}".format(ancestors_path))
ancestors_trees = get_ancestors_trees_path(args.ancestors_trees, args.samples)
sample_data = tsinfer.SampleData.load(args.samples)
ancestor_data = tsinfer.AncestorData.load(ancestors_path)
progress_monitor = ProgressMonitor(enabled=args.progress, match_ancestors=True)
ts = tsinfer.match_ancestors(
sample_data,
ancestor_data,
num_threads=args.num_threads,
progress_monitor=progress_monitor,
path_compression=not args.no_path_compression,
)
logger.info("Writing ancestors tree sequence to {}".format(ancestors_trees))
ts.dump(ancestors_trees)
summarise_usage()
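# Sketch of the full three-stage pipeline that these CLI commands wrap, using
# the real tsinfer API; the sample file name is hypothetical.
import tsinfer

sample_data = tsinfer.SampleData.load("data.samples")
ancestor_data = tsinfer.generate_ancestors(sample_data)
ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
ts = tsinfer.match_samples(sample_data, ancestors_ts)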
def run_verify(args):
setup_logging(args)
samples = tsinfer.SampleData.load(args.samples)
ts = tskit.load(args.tree_sequence)
progress_monitor = ProgressMonitor(enabled=args.progress, verify=True)
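    # verify() checks that the genotypes in the inferred tree sequence match
    # those in the sample data, raising an error on any mismatch.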
tsinfer.verify(samples, ts, progress_monitor=progress_monitor)
summarise_usage()
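# The converter below assumes these imports, which the fragment does not show:
import numpy as np
import h5py
import tsinfer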
def main(infile, outfile):
sample_data = tsinfer.SampleData.load(infile)
print(sample_data)
shape = (sample_data.num_inference_sites, sample_data.num_samples)
G = np.empty(shape, dtype=np.int8)
for j, (_, genotypes) in enumerate(sample_data.genotypes(inference_sites=True)):
G[j] = genotypes
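    # Transpose so that rows are samples and columns are sites before writing
    # the haplotypes dataset.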
with h5py.File(outfile, "w") as root:
root["haplotypes"] = G.T