How to use the datalad.distribution.dataset.require_dataset function in datalad

To help you get started, we’ve selected a few datalad examples based on popular ways require_dataset is used in public projects.

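All of the excerpts below share the same pattern: a command takes whatever dataset argument it was given (a path, a Dataset instance, or None) and hands it to require_dataset, which returns a resolved Dataset instance or raises if none can be found. A minimal standalone sketch of that call (the path and purpose string are placeholders, and exact failure behaviour varies between datalad versions):

from datalad.distribution.dataset import require_dataset

# Resolve a dataset from a path (a Dataset instance, or None for the current
# working directory, also works) and insist that it is actually installed.
ds = require_dataset(
    '/tmp/my-dataset',        # placeholder location
    check_installed=True,     # fail if no installed dataset is found there
    purpose='demonstration',  # only used to make failure messages clearer
)
print(ds.path)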

datalad/datalad: datalad/distributed/create_sibling_gitlab.py (view on GitHub)
            name=None,
            existing='error',
            access=None,
            publish_depends=None,
            description=None,
            dryrun=False):
        path = resolve_path(assure_list(path), ds=dataset) \
            if path else None

        if project and (recursive or (path and len(path) > 1)):
            raise ValueError(
                'Providing a GitLab project name/location cannot be combined '
                'with recursive operation or multiple paths, as each dataset '
                'needs to be mapped onto its own individual project.')
        # what to operate on
        ds = require_dataset(
            dataset, check_installed=True, purpose='create GitLab sibling(s)')

        # cache for objects of gitlab sites (we could face different ones
        # in a single hierarchy, cache them to avoid duplicate initialization
        # while still being able to process each dataset individually)
        siteobjs = dict()

        # which datasets to process?
        if path is None:
            for r in _proc_dataset(
                    ds, ds,
                    site, project, name, layout, existing, access,
                    dryrun, siteobjs, publish_depends, description):
                yield r
        if path or recursive:
            # also include any matching subdatasets
datalad/datalad: datalad/core/local/run.py (view on GitHub)
        lgr.warning("No command given")
        return
    if saver:
        warnings.warn("`saver` argument is ignored "
                      "and will be removed in a future release",
                      DeprecationWarning)

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
datalad/datalad: datalad/metadata/aggregate.py (view on GitHub)
def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not add ds.path always since then --recursive would
            # also recurse current even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
datalad/datalad: datalad/plugin/no_annex.py (view on GitHub)
def __call__(dataset, pattern, ref_dir='.', makedirs=False):
        # could be extended to accept actual largefile expressions
        from os.path import join as opj
        from os.path import isabs
        from os.path import exists
        from os import makedirs as makedirsfx
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo
        from datalad.utils import assure_list

        pattern = assure_list(pattern)
        ds = require_dataset(dataset, check_installed=True,
                             purpose='no_annex configuration')

        res_kwargs = dict(
            path=ds.path,
            type='dataset',
            action='no_annex',
        )

        # all the ways we refused to cooperate
        if not isinstance(ds.repo, AnnexRepo):
            yield dict(
                res_kwargs,
                status='notneeded',
                message='dataset has no annex')
            return
        if any(isabs(p) for p in pattern):
datalad/datalad: 3rd/datalad-revolution/datalad_revolution/dataset.py (view on GitHub)
def require_rev_dataset(dataset, check_installed=True, purpose=None):
    return RevolutionDataset(_require_dataset(
        dataset,
        check_installed,
        purpose).path)
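
The datalad-revolution wrapper above forwards its arguments to require_dataset and re-wraps the result in the extension's own dataset class. When require_dataset cannot deliver, it raises an exception rather than yielding a result record; a rough sketch of catching that directly (InsufficientArgumentsError lives in datalad.support.exceptions, and the exact exception types raised can differ between datalad versions):

from datalad.distribution.dataset import require_dataset
from datalad.support.exceptions import InsufficientArgumentsError

try:
    # None asks require_dataset to discover a dataset at the working directory
    ds = require_dataset(None, check_installed=True, purpose='a quick check')
except InsufficientArgumentsError as exc:
    # no dataset was given and none could be discovered
    print('no dataset available: %s' % exc)
except ValueError as exc:
    # a location was resolved, but no installed dataset lives there
    print('dataset not installed: %s' % exc)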
datalad/datalad: datalad/distribution/get.py (view on GitHub)
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            jobs='auto',
    ):
        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # we have to have a single dataset to operate on
        refds = require_dataset(
            dataset, check_installed=True, purpose='get content')

        content_by_ds = {}
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # locate paths in subdatasets several levels down
                recursive=True if path else recursive,
datalad/datalad: datalad/plugin/export_archive.py (view on GitHub)
        import os
        import tarfile
        import zipfile
        from mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)
datalad/datalad: datalad/core/local/create.py (view on GitHub)
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = assure_list(cfg_proc)

        # prep for yield
        res = dict(action='create', path=str(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        refds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
datalad/datalad: datalad/metadata/metadata.py (view on GitHub)
            path=None,
            dataset=None,
            get_aggregates=False,
            reporton='all',
            recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(
                refds_path,
                check_installed=True,
                purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds,
                version=str(aggregate_layout_version),
                abspath=True
            )
            if not agginfos:
                # if there has ever been an aggregation run, this file would
                # exist, hence there has not been and we need to tell this
                # to people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
datalad/datalad: datalad/distribution/install.py (view on GitHub)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            # git_opts=git_opts,
            # annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='installation')
            common_kwargs['dataset'] = dataset
        # pre-compute for results below
        refds_path = Interface.get_refds_path(ds)

        # switch into the two scenarios without --source:
        # 1. list of URLs
        # 2. list of (sub)dataset content
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            # TODO: this approach is problematic, it disrupts the order of input args.
            # consequently results will be returned in an unexpected order when a
            # mixture of source URL and paths is given. Reordering is only possible when
            # everything in here is fully processed before any results can be yielded.