How to use the toolz.partition_all function in toolz

To help you get started, we've selected a few toolz.partition_all examples, based on popular ways it is used in public projects.


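If you are new to partition_all itself, its behavior is easy to summarize: it lazily splits any iterable into tuples of at most n elements, with the final tuple holding whatever is left over. A minimal sketch, assuming only a standard toolz install:

from toolz import partition_all

print(list(partition_all(3, range(8))))
# [(0, 1, 2), (3, 4, 5), (6, 7)]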
github microsoft / seismic-deeplearning / interpretation / deepseismic_interpretation / data.py
        valid_modes = ("train", "test", "val")
        msg = "Unknown value '{}' for argument split. " "Valid values are {{{}}}."
        msg = msg.format(split, iterable_to_str(valid_modes))
        verify_str_arg(split, "split", valid_modes, msg)

        # Set the patch and stride for the patch extractor
        _extract_patches_from = _extract_patches(patch_size, stride, self._complete_patches_only)
        num_partitions = 5
        indexes = self._data_array.shape[0]
        num_elements = math.ceil(indexes / num_partitions)
        train_indexes_list = []
        test_indexes_list = []
        val_indexes_list = []

        for partition in partition_all(num_elements, range(indexes)):  # Partition files into N partitions
            train_indexes, val_indexes, test_indexes = _split_train_val_test(partition, val_ratio, test_ratio)
            train_indexes_list.extend(train_indexes)
            test_indexes_list.extend(test_indexes)
            val_indexes_list.extend(val_indexes)

        if split == "train":
            indexes = train_indexes_list
        elif split == "val":
            indexes = val_indexes_list
        elif split == "test":
            indexes = test_indexes_list

        # Extract patches
        for index in indexes:
            img_array = self._data_array[index]
            mask_array = self._slice_mask_array[index]
github blaze / blaze / blaze / h5py.py
def into(dset, seq, chunksize=int(2**10), **kwargs):
    assert not isinstance(dset, type)
    for chunk in partition_all(chunksize, seq):
        into(dset, into(np.ndarray, chunk, dshape=discover(dset).measure), **kwargs)
    return dset
github GeoscienceAustralia / digitalearthau / digitalearthau / vpmapper / _dask.py
def seq_to_bags(its: Iterable[Any],
                chunk_sz: int,
                name: str = 'data'):
    """ Take a stream of data items and return a stream of dask.bag.Bag
        each bag (except last) containing ``chunk_sz`` elements in 1 partition.
    """
    for chunk in toolz.partition_all(chunk_sz, its):
        prefix = _randomize(name)
        dsk = {(prefix, 0): chunk}
        yield dask.bag.Bag(dsk, prefix, 1)
github rapidsai / dask-cudf / dask_cudf / core.py
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = "{0}-combine-{1}".format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
github dask / dask / dask / dataframe / core.py
        raise ValueError("scalar only valid for 2 column dataframe")

    token = tokenize(df, min_periods, scalar, split_every)

    funcname = 'corr' if corr else 'cov'
    a = '{0}-chunk-{1}'.format(funcname, df._name)
    dsk = {(a, i): (cov_corr_chunk, f, corr)
           for (i, f) in enumerate(df._keys())}

    prefix = '{0}-combine-{1}-'.format(funcname, df._name)
    k = df.npartitions
    b = a
    depth = 0
    while k > split_every:
        b = prefix + str(depth)
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            dsk[(b, part_i)] = (cov_corr_combine, [(a, i) for i in inds], corr)
        k = part_i + 1
        a = b
        depth += 1

    name = '{0}-{1}'.format(funcname, token)
    dsk[(name, 0)] = (cov_corr_agg, [(a, i) for i in range(k)],
                      df.columns, min_periods, corr, scalar)
    dsk.update(df.dask)
    if scalar:
        return Scalar(dsk, name, 'f8')
    meta = make_meta([(c, 'f8') for c in df.columns], index=df.columns)
    return DataFrame(dsk, name, meta, (df.columns[0], df.columns[-1]))
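
In both tree-reduction snippets above (dask-cudf and dask.dataframe), partition_all does the level-to-level batching: it groups the current level's partition indices into chunks of at most split_every, and each chunk becomes one combine task at the next level. A toy illustration of just that grouping step, with k and split_every chosen arbitrarily:

from toolz import partition_all

split_every = 4
k = 10  # partitions remaining at the current reduction level
print(list(partition_all(split_every, range(k))))
# [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9)]  -> three combine tasks feed the next level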
github vals / umis / umis / umis.py
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
github blaze / odo / odo / backends / mongo.py
def append_iterator_to_pymongo(coll, seq, columns=None, dshape=None, chunksize=1024, **kwargs):
    seq = iter(seq)
    item = next(seq)
    seq = concat([[item], seq])

    if isinstance(item, (tuple, list)):
        if not columns and dshape:
            columns = dshape.measure.names
        if not columns:
            raise ValueError("Inputs must be dictionaries. "
                "Or provide columns=[...] or dshape=DataShape(...) keyword")
        seq = (dict(zip(columns, item)) for item in seq)

    for block in partition_all(1024, seq):
        coll.insert(copy.deepcopy(block))

    return coll
github enigmampc / catalyst / catalyst / assets / assets.py
    def _lookup_most_recent_symbols(self, sids):
        symbols = {
            row.sid: {c: row[c] for c in symbol_columns}
            for row in concat(
                self.engine.execute(
                    self._select_most_recent_symbols_chunk(sid_group),
                ).fetchall()
                for sid_group in partition_all(
                    SQLITE_MAX_VARIABLE_NUMBER,
                    sids
                ),
            )
        }

        if len(symbols) != len(sids):
            raise EquitiesNotFound(
                sids=set(sids) - set(symbols),
                plural=True,
            )
        return symbols
github vals / umis / umis / umis.py
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
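
Both umis snippets rely on the same nested chunking idiom: the inner partition_all builds work units of 10000 reads, and the outer partition_all groups those units into batches of cores, so each Pool.map call hands exactly one work unit to every worker. A toy illustration of the nesting, with small numbers standing in for 10000 and cores:

from toolz import partition_all

chunks = partition_all(3, range(10))     # work units of up to 3 items
bigchunks = partition_all(2, chunks)     # batches of 2 work units, e.g. for 2 workers
print([list(bigchunk) for bigchunk in bigchunks])
# [[(0, 1, 2), (3, 4, 5)], [(6, 7, 8), (9,)]]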