import numpy as np
import tsinfer
import zarr

# Collapse the simulated tree sequence into site positions and a genotype matrix.
positions = np.array([v.position for v in ts.variants()])
S = np.zeros((ts.sample_size, ts.num_mutations), dtype="u1")
for variant in ts.variants():
    S[:, variant.index] = variant.genotypes
G = S.astype(np.uint8).T

# Create the input file and the ancestors.
input_root = zarr.group()
tsinfer.InputFile.build(
    input_root, genotypes=G,
    # genotype_qualities=tsinfer.proba_to_phred(error_probability),
    position=positions,
    recombination_rate=rho, sequence_length=ts.sequence_length,
    compress=False)
ancestors_root = zarr.group()
# tsinfer.extract_ancestors(ts, ancestors_root)
tsinfer.build_simulated_ancestors(input_root, ancestors_root, ts)

# Match ancestors, then samples, to obtain the inferred tree sequence.
ancestors_ts = tsinfer.match_ancestors(input_root, ancestors_root)
assert ancestors_ts.sequence_length == ts.num_sites
inferred_ts = tsinfer.match_samples(
    input_root, ancestors_ts, method="C", simplify=False)
print("inferred num_edges =", inferred_ts.num_edges)
def convert_to_zarr(df, store_type, chunks):
    """Anything is possible with ZARR"""
    path = _get_temp_path(".zarr")
    # Clamp the requested chunk sizes to the actual matrix shape.
    adj_chunks = (min(df.shape[0], chunks[0]), min(df.shape[1], chunks[1]))
    store = getattr(zarr, store_type)(path)
    root = zarr.group(store=store)
    root.create_dataset("data", data=df.to_numpy(), chunks=adj_chunks, dtype='f4')
    root.create_dataset("cell_name", data=df.index.tolist())
    root.create_dataset("gene_name", data=df.columns.tolist())
    qcs = fake_qc_values(NUM_QC_VALUES, df.index, seed=df.values.sum())
    qc_chunks = (min(qcs.shape[0], chunks[0]), min(qcs.shape[1], chunks[1]))
    root.create_dataset("qc_values", data=qcs, chunks=qc_chunks)
    root.create_dataset("qc_names", data=qcs.columns.tolist())
    return path
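A hedged usage sketch: _get_temp_path, fake_qc_values and NUM_QC_VALUES are helpers from the surrounding module and must already be defined; the DataFrame and store type below are made-up examples.

import numpy as np
import pandas as pd
import zarr

df = pd.DataFrame(
    np.random.rand(4, 3).astype("f4"),
    index=["cell_a", "cell_b", "cell_c", "cell_d"],
    columns=["gene_x", "gene_y", "gene_z"])
path = convert_to_zarr(df, store_type="DirectoryStore", chunks=(2, 2))
root = zarr.open_group(path, mode="r")
print(root["data"].shape, list(root["gene_name"][:]))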
def __init__(
    self,
    path=None,
    num_flush_threads=0,
    compressor=DEFAULT_COMPRESSOR,
    chunk_size=1024,
    max_file_size=None,
):
    self._mode = self.BUILD_MODE
    self._num_flush_threads = num_flush_threads
    self._chunk_size = max(1, chunk_size)
    self._metadata_codec = numcodecs.JSON()
    self._compressor = compressor
    self.data = zarr.group()
    self.path = path
    if path is not None:
        store = self._new_lmdb_store(max_file_size)
        self.data = zarr.open_group(store=store, mode="w")
    self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
    self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
    self.data.attrs["uuid"] = str(uuid.uuid4())

    chunks = self._chunk_size
    provenances_group = self.data.create_group("provenances")
    provenances_group.create_dataset(
        "timestamp",
        shape=(0,),
        chunks=chunks,
        compressor=self._compressor,
        dtype=object,
        object_codec=self._metadata_codec,
    )
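The _new_lmdb_store helper is not part of this excerpt; a hypothetical sketch of what such a helper could look like using zarr's LMDBStore (the default map size and single-file layout are assumptions):

import zarr

def _new_lmdb_store(self, max_file_size=None):
    # Hypothetical helper: open a single-file LMDB store sized to max_file_size.
    map_size = 2**30 if max_file_size is None else max_file_size
    return zarr.LMDBStore(self.path, subdir=False, map_size=map_size)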
def copy(self, path=None, max_file_size=None):
    """
    Returns a copy of this data container opened in edit mode. If path is
    specified, it must not be equal to the path of the current
    data container. The new container will have a different UUID to the
    current.
    """
    if self._mode != self.READ_MODE:
        raise ValueError("Cannot copy unless in read mode.")
    if path is not None and self.path is not None:
        if os.path.abspath(path) == os.path.abspath(self.path):
            raise ValueError("Cannot copy to the same file")
    cls = type(self)
    other = cls.__new__(cls)
    other.path = path
    if path is None:
        # Have to work around a fairly weird bug in zarr where if we
        # try to use copy_store on an in-memory array we end up
        # overwriting the original values.
        other.data = zarr.group()
        zarr.copy_all(source=self.data, dest=other.data)
        for key, value in self.data.attrs.items():
            other.data.attrs[key] = value
    else:
        store = other._new_lmdb_store(max_file_size)
        zarr.copy_store(self.data.store, store)
        other.data = zarr.group(store)
    # Set a new UUID
    other.data.attrs["uuid"] = str(uuid.uuid4())
    other.data.attrs[FINALISED_KEY] = False
    other._mode = self.EDIT_MODE
    return other
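A standalone sketch of the two copy paths used above (in-memory copy_all plus a manual attribute copy, versus a raw copy_store for store-backed data), assuming nothing beyond zarr itself:

import zarr

src = zarr.group()
src.attrs["uuid"] = "example"
src.create_dataset("x", data=[1, 2, 3])

# In-memory path: copy child arrays, then mirror the root attributes by hand.
dst = zarr.group()
zarr.copy_all(source=src, dest=dst)
for key, value in src.attrs.items():
    dst.attrs[key] = value

# Store-backed path: copy the raw key/value store and re-open it as a group.
dst_store = zarr.MemoryStore()
zarr.copy_store(src.store, dst_store)
dst2 = zarr.group(store=dst_store)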
parser.add_argument('-P', '--progress', action='store_true',
                    help='Show a progress bar.')
args = parser.parse_args()

method, path_compression, simplify = "C", True, True  # set defaults
for i, fn in enumerate(args.infiles):
    ext = ('.' + str(i)) if len(args.infiles) > 1 else ''
    if args.outfile:
        out_fn = args.outfile + ext
    else:
        out_fn = os.path.splitext(fn)[0] + '.hdf5'
    if not os.path.isfile(fn):
        raise FileNotFoundError(fn)
    input_hdf5 = zarr.DBMStore(fn, open=bsddb3.btopen)
    input_root = zarr.group(store=input_hdf5)
    ancestors_root = zarr.group()
    # Build ancestors, match them, then match the samples against the ancestors.
    tsinfer.build_ancestors(
        input_root, ancestors_root, method=method, chunk_size=16, compress=False,
        progress=args.progress)
    ancestors_ts = tsinfer.match_ancestors(
        input_root, ancestors_root, method=method, path_compression=path_compression,
        progress=args.progress)
    full_inferred_ts = tsinfer.match_samples(
        input_root, ancestors_ts, method=method, path_compression=path_compression,
        simplify=simplify, progress=args.progress)
    full_inferred_ts.dump(out_fn)
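The dumped file can be loaded back for inspection; a short sketch using tskit (the file name comes from the run above):

import tskit

inferred = tskit.load(out_fn)
print(inferred.num_samples, inferred.num_edges)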
def __init__(
    self,
    dataset,
    model: Model,
    zobject: Optional[Union[zarr.Group, MutableMapping, str]] = None,
    encoding: Optional[EncodingDict] = None,
    batch_dim: Optional[str] = None,
    lock: Optional[Any] = None,
):
    self.dataset = dataset
    self.model = model

    self.in_memory = False
    self.consolidated = False

    # Accept an existing group, nothing (in-memory store), or any store/path.
    if isinstance(zobject, zarr.Group):
        self.zgroup = zobject
    elif zobject is None:
        self.zgroup = zarr.group(store=zarr.MemoryStore())
        self.in_memory = True
    else:
        self.zgroup = zarr.group(store=zobject)

    self.output_vars = dataset.xsimlab.output_vars_by_clock
    self.output_save_steps = dataset.xsimlab.get_output_save_steps()

    if encoding is None:
        encoding = {}
    self.var_info = _get_var_info(dataset, model, encoding)

    self.batch_dim = batch_dim
    self.batch_size = get_batch_size(dataset, batch_dim)
    self.mclock_dim = dataset.xsimlab.master_clock_dim
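A short sketch of the three zobject cases handled above, using zarr directly (the store path is made up):

import zarr

in_memory = zarr.group(store=zarr.MemoryStore())       # zobject is None
on_disk = zarr.group(store="simulation_output.zarr")   # zobject is a path or store
reused = zarr.open_group("simulation_output.zarr", mode="a")  # an existing zarr.Group is used as-is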
        physical_dist = physical_pos - last_physical_pos
        genetic_dist = genetic_pos - last_genetic_pos
        # Rate for this interval: ratio of physical to genetic distance
        # (0 if there is no genetic distance).
        scaled_recomb_rate = 0
        if genetic_dist > 0:
            scaled_recomb_rate = physical_dist / genetic_dist
        recombination_rates[i] = scaled_recomb_rate
except FileNotFoundError:
    print("Genetic map file {} not found, defaulting to constant recombination rate of {}".format(
        genetic_map_file, args.recombination_rate))

output_file = args.outfile + str(c) + ".tsinf"
if os.path.exists(output_file):
    os.unlink(output_file)
input_hdf5 = zarr.DBMStore(output_file, open=bsddb3.btopen)
root = zarr.group(store=input_hdf5, overwrite=True)
tsinfer.InputFile.build(
    root,
    genotypes=sites_by_samples,
    position=list(dat['position'].keys()),
    # sample_names=[s.encode() for s in reduced_rows],
    recombination_rate=recombination_rates)
input_hdf5.close()
print("Saved {} biallelic loci for {} samples into {}".format(
    len(dat['position']), len(reduced_rows), output_file))
"""
Then do something like
raise ValueError("Cannot copy to the same file")
cls = type(self)
other = cls.__new__(cls)
other.path = path
if path is None:
# Have to work around a fairly weird bug in zarr where if we
# try to use copy_store on an in-memory array we end up
# overwriting the original values.
other.data = zarr.group()
zarr.copy_all(source=self.data, dest=other.data)
for key, value in self.data.attrs.items():
other.data.attrs[key] = value
else:
store = other._new_lmdb_store(max_file_size)
zarr.copy_store(self.data.store, store)
other.data = zarr.group(store)
# Set a new UUID
other.data.attrs["uuid"] = str(uuid.uuid4())
other.data.attrs[FINALISED_KEY] = False
other._mode = self.EDIT_MODE
return other