How to use the zarr.group function in zarr

To help you get started, we've selected a few zarr.group examples based on popular ways the function is used in public projects.
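All of the examples below follow the same basic pattern: zarr.group() returns a root group backed by an in-memory store unless you pass a store explicitly, and arrays and subgroups are then created beneath it. Here is a minimal sketch of that pattern (the directory name is illustrative):

import numpy as np
import zarr

# No store argument: zarr.group() creates a group backed by an in-memory store.
root = zarr.group()
root.create_dataset("data", data=np.arange(12).reshape(3, 4), chunks=(2, 2))

# With a store argument: the group is persisted, here to a directory on disk.
store = zarr.DirectoryStore("example.zarr")  # illustrative path
persistent = zarr.group(store=store, overwrite=True)
persistent.attrs["description"] = "example group"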

github tskit-dev / tsinfer / tests / testcase_fail.py (View on GitHub)
import numpy as np
import tsinfer
import zarr

# ts (a tree sequence) and rho (a recombination rate) are defined earlier
# in the test.
positions = np.array([v.position for v in ts.variants()])
S = np.zeros((ts.sample_size, ts.num_mutations), dtype="u1")
for variant in ts.variants():
    S[:, variant.index] = variant.genotypes

G = S.astype(np.uint8).T

# Create the ancestors.
input_root = zarr.group()
tsinfer.InputFile.build(
    input_root, genotypes=G,
    # genotype_qualities=tsinfer.proba_to_phred(error_probability),
    position=positions,
    recombination_rate=rho, sequence_length=ts.sequence_length,
    compress=False)
ancestors_root = zarr.group()

# tsinfer.extract_ancestors(ts, ancestors_root)
tsinfer.build_simulated_ancestors(input_root, ancestors_root, ts)

ancestors_ts = tsinfer.match_ancestors(input_root, ancestors_root)
assert ancestors_ts.sequence_length == ts.num_sites
inferred_ts = tsinfer.match_samples(
    input_root, ancestors_ts, method="C",
    simplify=False)

print("inferred num_edges =", inferred_ts.num_edges)
github HumanCellAtlas / table-testing / create_data / converters.py (View on GitHub)
def convert_to_zarr(df, store_type, chunks):
    """Anything is possible with ZARR"""
    # _get_temp_path, fake_qc_values and NUM_QC_VALUES are defined elsewhere
    # in the module.
    path = _get_temp_path(".zarr")
    adj_chunks = (min(df.shape[0], chunks[0]), min(df.shape[1], chunks[1]))

    # Instantiate the store class named by store_type, e.g. "DirectoryStore".
    store = getattr(zarr, store_type)(path)
    root = zarr.group(store=store)

    # df.as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
    root.create_dataset("data", data=df.to_numpy(), chunks=adj_chunks, dtype='f4')
    root.create_dataset("cell_name", data=df.index.tolist())
    root.create_dataset("gene_name", data=df.columns.tolist())

    qcs = fake_qc_values(NUM_QC_VALUES, df.index, seed=df.values.sum())
    qc_chunks = (min(qcs.shape[0], chunks[0]), min(qcs.shape[1], chunks[1]))
    root.create_dataset("qc_values", data=qcs, chunks=qc_chunks)
    root.create_dataset("qc_names", data=qcs.columns.tolist())

    return path
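A hypothetical round trip with the function above, assuming df is a pandas DataFrame and the "DirectoryStore" store type, could look like this:

import zarr

# Write the DataFrame out, then reopen the resulting group read-only.
path = convert_to_zarr(df, "DirectoryStore", chunks=(1000, 1000))
root = zarr.open_group(path, mode="r")
print(root["data"].shape, root["data"].chunks)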
github tskit-dev / tsinfer / tsinfer / formats.py (View on GitHub)
def __init__(
    self,
    path=None,
    num_flush_threads=0,
    compressor=DEFAULT_COMPRESSOR,
    chunk_size=1024,
    max_file_size=None,
):
    self._mode = self.BUILD_MODE
    self._num_flush_threads = num_flush_threads
    self._chunk_size = max(1, chunk_size)
    self._metadata_codec = numcodecs.JSON()
    self._compressor = compressor
    # Start with an in-memory group; switch to an LMDB-backed group when
    # a path is supplied.
    self.data = zarr.group()
    self.path = path
    if path is not None:
        store = self._new_lmdb_store(max_file_size)
        self.data = zarr.open_group(store=store, mode="w")
    self.data.attrs[FORMAT_NAME_KEY] = self.FORMAT_NAME
    self.data.attrs[FORMAT_VERSION_KEY] = self.FORMAT_VERSION
    self.data.attrs["uuid"] = str(uuid.uuid4())

    chunks = self._chunk_size
    provenances_group = self.data.create_group("provenances")
    provenances_group.create_dataset(
        "timestamp",
        shape=(0,),
        chunks=chunks,
        compressor=self._compressor,
        dtype=object,
        # dtype=object arrays need an object codec; JSON serves here.
        object_codec=self._metadata_codec,
    )
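The path branch above goes through a tsinfer helper, but the underlying zarr pattern is just an LMDB-backed store. A minimal sketch, assuming zarr 2.x with the lmdb package installed and an illustrative file name:

import zarr

# LMDBStore persists the group in a Lightning Memory-Mapped Database file.
store = zarr.LMDBStore("container.lmdb")
root = zarr.open_group(store=store, mode="w")
root.attrs["uuid"] = "example"
store.close()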
github tskit-dev / tsinfer / tsinfer / formats.py (View on GitHub)
def copy(self, path=None, max_file_size=None):
    """
    Returns a copy of this data container. The new container will have a
    different UUID to the current.
    """
    if self._mode != self.READ_MODE:
        raise ValueError("Cannot copy unless in read mode.")
    if path is not None and self.path is not None:
        if os.path.abspath(path) == os.path.abspath(self.path):
            raise ValueError("Cannot copy to the same file")
    cls = type(self)
    other = cls.__new__(cls)
    other.path = path
    if path is None:
        # Have to work around a fairly weird bug in zarr where if we
        # try to use copy_store on an in-memory array we end up
        # overwriting the original values.
        other.data = zarr.group()
        zarr.copy_all(source=self.data, dest=other.data)
        for key, value in self.data.attrs.items():
            other.data.attrs[key] = value
    else:
        store = other._new_lmdb_store(max_file_size)
        zarr.copy_store(self.data.store, store)
        other.data = zarr.group(store)
    # Set a new UUID.
    other.data.attrs["uuid"] = str(uuid.uuid4())
    other.data.attrs[FINALISED_KEY] = False
    other._mode = self.EDIT_MODE
    return other
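The in-memory branch above (a group-to-group copy via zarr.copy_all, followed by mirroring the root attributes by hand) also works standalone; a minimal sketch:

import numpy as np
import zarr

src = zarr.group()
src.create_dataset("x", data=np.arange(10), chunks=5)
src.attrs["label"] = "original"

dst = zarr.group()  # a second, independent in-memory group
zarr.copy_all(source=src, dest=dst)
# Mirror the root attributes by hand, as the tsinfer code does.
for key, value in src.attrs.items():
    dst.attrs[key] = value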
github tskit-dev / tsinfer / run_tsinf.py (View on GitHub)
# parser is an argparse.ArgumentParser built earlier in the script.
parser.add_argument('-P', '--progress', action='store_true',
                    help='Show a progress bar.')
args = parser.parse_args()

method, path_compression, simplify = "C", True, True  # set defaults

for i, fn in enumerate(args.infiles):
    ext = ('.' + str(i)) if len(args.infiles) > 1 else ''
    if args.outfile:
        out_fn = args.outfile + ext
    else:
        out_fn = os.path.splitext(fn)[0] + '.hdf5'
    if not os.path.isfile(fn):
        raise FileNotFoundError(fn)
    # Open the input file as a Berkeley-DB-backed zarr store.
    input_hdf5 = zarr.DBMStore(fn, open=bsddb3.btopen)
    input_root = zarr.group(store=input_hdf5)

    ancestors_root = zarr.group()
    tsinfer.build_ancestors(
        input_root, ancestors_root, method=method, chunk_size=16, compress=False,
        progress=args.progress)
    ancestors_ts = tsinfer.match_ancestors(
        input_root, ancestors_root, method=method, path_compression=path_compression,
        progress=args.progress)
    full_inferred_ts = tsinfer.match_samples(
        input_root, ancestors_ts, method=method, path_compression=path_compression,
        simplify=simplify, progress=args.progress)
    full_inferred_ts.dump(out_fn)
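The store used above, in isolation: zarr's DBMStore wraps any dbm-style database, here Berkeley DB via bsddb3 (zarr 2.x; the file name is illustrative):

import bsddb3
import zarr

# Attach a root group to a Berkeley DB file via a dbm-compatible opener.
store = zarr.DBMStore("input.db", open=bsddb3.btopen)
root = zarr.group(store=store)
print(list(root.array_keys()))
store.close()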
github benbovy / xarray-simlab / xsimlab / stores.py (View on GitHub)
def __init__(
    self,
    dataset: xr.Dataset,
    model: Model,
    zobject: Optional[Union[zarr.Group, MutableMapping, str]] = None,
    encoding: Optional[EncodingDict] = None,
    batch_dim: Optional[str] = None,
    lock: Optional[Any] = None,
):
    self.dataset = dataset
    self.model = model

    self.in_memory = False
    self.consolidated = False

    # Accept an existing group, nothing (in-memory), or anything that
    # zarr.group accepts as a store (a mapping or a path).
    if isinstance(zobject, zarr.Group):
        self.zgroup = zobject
    elif zobject is None:
        self.zgroup = zarr.group(store=zarr.MemoryStore())
        self.in_memory = True
    else:
        self.zgroup = zarr.group(store=zobject)

    self.output_vars = dataset.xsimlab.output_vars_by_clock
    self.output_save_steps = dataset.xsimlab.get_output_save_steps()

    if encoding is None:
        encoding = {}

    self.var_info = _get_var_info(dataset, model, encoding)

    self.batch_dim = batch_dim
    self.batch_size = get_batch_size(dataset, batch_dim)

    self.mclock_dim = dataset.xsimlab.master_clock_dim
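The three branches above correspond to the three ways zarr.group can be reached; illustrated directly (the path is illustrative):

import zarr

existing = zarr.group()                            # a Group is passed through unchanged
in_memory = zarr.group(store=zarr.MemoryStore())   # explicit in-memory store
on_disk = zarr.group(store="output.zarr")          # a string path becomes a directory store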
github tskit-dev / tsinfer / vcf2tsinfer.py (View on GitHub)
                physical_dist = physical_pos - last_physical_pos
                genetic_dist = genetic_pos - last_genetic_pos
                scaled_recomb_rate = 0
                if genetic_dist > 0:
                    scaled_recomb_rate = physical_dist / genetic_dist
                recombination_rates[i] = scaled_recomb_rate
        except FileNotFoundError:
            print("Genetic map file {} not found, defaulting to constant recombination rate of {}".format(
                genetic_map_file, args.recombination_rate))


    output_file = args.outfile + str(c) + ".tsinf"
    if os.path.exists(output_file):
        os.unlink(output_file)
    # Write the input data into a Berkeley-DB-backed zarr store.
    input_hdf5 = zarr.DBMStore(output_file, open=bsddb3.btopen)
    root = zarr.group(store=input_hdf5, overwrite=True)
    tsinfer.InputFile.build(
        root,
        genotypes=sites_by_samples,
        position=list(dat['position'].keys()),
        recombination_rate=recombination_rates)
    # sample_names=[s.encode() for s in reduced_rows]
    input_hdf5.close()
    print("Saved {} biallelic loci for {} samples into {}".format(
        len(dat['position']), len(reduced_rows), output_file))


"""
Then do something like