How to use the yellowbrick.datasets.path.find_dataset_path function in yellowbrick

To help you get started, we’ve selected a few yellowbrick examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github DistrictDataLabs / yellowbrick / tests / test_datasets / test_loaders.py View on Github external
def assert_valid_dataset(data, name):
    """
    Assert that a loaded dataset is complete and well-formed: it is a Dataset
    instance registered in the manifest, its archive matches the recorded
    signature, and it ships with all expected files and metadata keys.
    """
    # Hide this helper from pytest tracebacks so failures point at the caller
    __tracebackhide__ = True
    assert isinstance(data, Dataset), "not a Dataset object"
    assert name in DATASETS, "dataset not in manifest"

    assert dataset_exists(name), "dataset directory does not exist"
    assert dataset_archive(name, DATASETS[name]["signature"]), "dataset archive does not match signature"
    # both serializations must be present: the gzipped CSV and the numpy archive
    # (fixed: message previously said ".csv.tgz" although the check is for ".csv.gz")
    assert find_dataset_path(name, ext=".csv.gz", raises=False) is not None, "no .csv.gz in dataset"
    assert find_dataset_path(name, ext=".npz", raises=False) is not None, "no .npz in dataset"

    n_files = len(data.contents())
    # exactly 4 required files, plus an optional citation.bib
    # (message fixed: the check also fails when there are too many files)
    assert n_files in (4, 5), "expected 4 or 5 files in dataset"
    assert len(data.README) > 0, "readme contains no data"
    assert len(data.meta) > 0, "metadata is empty"

    if n_files == 5:
        assert len(data.citation) > 0, "citation.bib is empty"

    assert "features" in data.meta, "no features in metadata"
    assert "target" in data.meta, "no target in metadata"
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
"""
        Returns the entire dataset as a single pandas DataFrame.

        Returns
        -------
        df : DataFrame with shape (n_instances, n_columns)
            A pandas DataFrame containing the complete original data table
            including all targets (specified by the meta data) and all
            features (including those that might have been filtered out).
        """
        if pd is None:
            raise DatasetsError(
                "pandas is required to load DataFrame, it can be installed with pip"
            )

        path = find_dataset_path(self.name, ext=".csv.gz", data_home=self.data_home)
        return pd.read_csv(path, compression="gzip")
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
def contents(self):
        """
        List the names of the files inside this dataset's data directory.
        """
        directory = find_dataset_path(self.name, data_home=self.data_home, ext=None)
        return os.listdir(directory)
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
def root(self):
        """
        Locate the root directory of the corpus on disk and return its path.
        """
        path = find_dataset_path(self.name, data_home=self.data_home, ext=None)
        return path
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
def to_numpy(self):
        """
        Load the dataset from its .npz archive as two numpy arrays.

        Returns
        -------
        X : array-like with shape (n_instances, n_features)
            A numpy array describing the instance features.

        y : array-like with shape (n_instances,)
            A numpy array describing the target vector.

        Raises
        ------
        DatasetsError
            If the archive does not contain both the "X" and "y" arrays.
        """
        archive = find_dataset_path(self.name, ext=".npz", data_home=self.data_home)
        with np.load(archive, allow_pickle=False) as npf:
            # require both arrays before returning anything
            has_both = "X" in npf and "y" in npf
            if not has_both:
                raise DatasetsError(
                    (
                        "the downloaded dataset was improperly packaged without numpy "
                        "arrays - please report this bug to the Yellowbrick maintainers!"
                    )
                )

            # TODO: How to handle the case where y is None?
            return npf["X"], npf["y"]
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
def meta(self):
        """
        Parse and return the meta.json file describing important attributes of
        the dataset; returns None when the file is not present on disk.
        """
        meta_path = find_dataset_path(
            self.name, data_home=self.data_home, fname="meta.json", raises=False
        )
        if meta_path is None:
            return None

        with open(meta_path, "r") as fobj:
            return json.load(fobj)
github DistrictDataLabs / yellowbrick / yellowbrick / datasets / base.py View on Github external
def README(self):
        """
        Read and return the full text of the dataset's README.md, which
        describes the dataset in detail and contains attribution information.
        """
        readme = find_dataset_path(self.name, data_home=self.data_home, fname="README.md")
        with open(readme, "r") as fobj:
            return fobj.read()