How to use the deepchem.utils.get_data_dir function in deepchem

To help you get started, we've selected a few deepchem examples based on popular ways this function is used in public projects.
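
deepchem.utils.get_data_dir returns the directory DeepChem uses to download and cache datasets. A minimal sketch of calling it directly; the DEEPCHEM_DATA_DIR environment variable (also referenced by the DiabeticRetinopathy example below) overrides the default location, so treat the exact fallback behavior as an assumption about your DeepChem version:

import os
import deepchem

# Optionally point DeepChem at a custom dataset directory; if the
# environment variable is unset, get_data_dir typically falls back to
# the system temporary directory.
os.environ['DEEPCHEM_DATA_DIR'] = '/tmp/deepchem_data'

data_dir = deepchem.utils.get_data_dir()
print(data_dir)  # -> /tmp/deepchem_data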


Example from deepchem/deepchem: deepchem/molnet/load_function/hopv_datasets.py (view on GitHub)
def load_hopv(featurizer='ECFP', split='index', reload=True):
  """Load HOPV datasets. Does not do train/test split"""
  # Featurize HOPV dataset
  logger.info("About to featurize HOPV dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "hopv/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "hopv.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hopv.tar.gz'
    )
    deepchem.utils.untargz_file(os.path.join(data_dir, 'hopv.tar.gz'), data_dir)

  hopv_tasks = [
      'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
      'J_SC', 'fill_factor'
  ]

  if reload:
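    # ... (snippet truncated; see the full source on GitHub)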
Example from deepchem/deepchem: deepchem/molnet/load_function/qm9_datasets.py (view on GitHub)
"""
qm9 dataset loader.
"""
from __future__ import division
from __future__ import unicode_literals

import os
import logging
import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
GDB9_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz'
QM9_CSV_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm9.csv'


def load_qm9(featurizer='CoulombMatrix',
             split='random',
             reload=True,
             move_mean=True,
             data_dir=None,
             save_dir=None,
             **kwargs):
  """Load qm9 datasets."""
  # Featurize qm9 dataset
  logger.info("About to featurize qm9 dataset.")
  qm9_tasks = [
      "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv", "u0", "u298",
Example from deepchem/deepchem: deepchem/molnet/load_function/toxcast_datasets.py (view on GitHub)
def load_toxcast(featurizer='ECFP', split='index', reload=True):

  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir,
                            "toxcast/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "toxcast_data.csv.gz")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/toxcast_data.csv.gz'
    )

  dataset = deepchem.utils.save.load_from_disk(dataset_file)
  logger.info("Columns of dataset: %s" % str(dataset.columns.values))
  logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
  TOXCAST_tasks = dataset.columns.values[1:].tolist()

  if reload:
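    # ... (snippet truncated; see the full source on GitHub)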
Example from deepchem/deepchem: contrib/DiabeticRetinopathy/data.py (view on GitHub)
def load_images_DR(split='random', seed=None):
  """ Loader for DR images """
  data_dir = deepchem.utils.get_data_dir()
  images_path = os.path.join(data_dir, 'DR', 'train')
  label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
  if not os.path.exists(images_path) or not os.path.exists(label_path):
    logger.warning("Cannot locate data.\n\
        All images (.jpeg) should be stored in the folder $DEEPCHEM_DATA_DIR/DR/train/,\n\
        and the corresponding label file should be stored as $DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n\
        Please refer to https://www.kaggle.com/c/diabetic-retinopathy-detection for data access.")

  image_names = os.listdir(images_path)
  raw_images = []
  for im in image_names:
    if (im.endswith('.jpeg') and not im.startswith('cut_') and
        'cut_' + im not in image_names):
      raw_images.append(im)
  if len(raw_images) > 0:
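    # ... (snippet truncated; see the full source on GitHub)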
Example from deepchem/deepchem: deepchem/molnet/load_function/lipo_datasets.py (view on GitHub)
def load_lipo(featurizer='ECFP', split='index', reload=True, move_mean=True):
  """Load Lipophilicity datasets."""
  # Featurize Lipophilicity dataset
  logger.info("About to featurize Lipophilicity dataset.")
  logger.info("About to load Lipophilicity dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    if move_mean:
      dir_name = "lipo/" + featurizer + "/" + str(split)
    else:
      dir_name = "lipo/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  dataset_file = os.path.join(data_dir, "Lipophilicity.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/Lipophilicity.csv'
    )

  Lipo_tasks = ['exp']

  if reload:
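    # ... (snippet truncated; see the full source on GitHub)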
Example from deepchem/deepchem: deepchem/molnet/load_function/qm8_datasets.py (view on GitHub)
"""
qm8 dataset loader.
"""
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem
import logging

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
GDB8_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb8.tar.gz'
QM8_CSV_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm8.csv'


def load_qm8(featurizer='CoulombMatrix',
             split='random',
             reload=True,
             move_mean=True,
             data_dir=None,
             save_dir=None,
             **kwargs):
  qm8_tasks = [
      "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
      "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM",
      "f1-CAM", "f2-CAM"
  ]
Example from deepchem/deepchem: deepchem/molnet/load_function/bace_datasets.py (view on GitHub)
def load_bace_classification(featurizer='ECFP', split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    )

  bace_tasks = ["Class"]
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bace_tasks, all_dataset, transformers
Example from deepchem/deepchem: contrib/pubchem_dataset/create_assay_overview.py (view on GitHub)
import pandas as pd
import os
import pickle
import array
from bisect import bisect_left
import gzip
import shutil
import deepchem
import requests

data_dir = deepchem.utils.get_data_dir()
sdf_dir = os.path.join(data_dir, "Data")


def create_cid_list(assays_to_parse):
  """Find the union of all compounds tested across one or more assays
  """
  min_assay_size = 10000
  assay_paths = list()
  cid_set = set()

  for path, dirs, filenames in os.walk(sdf_dir):
    for dir in dirs:
      # Each directory holds a range of assay results
      joined_path = os.path.join(sdf_dir, dir)
      for path, dirs, filenames in os.walk(joined_path):
        for filename in filenames:
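          # ... (snippet truncated; see the full source on GitHub)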
Example from deepchem/deepchem: deepchem/molnet/load_function/hiv_datasets.py (view on GitHub)
def load_hiv(featurizer='ECFP', split='index', reload=True, **kwargs):
  """Load hiv datasets. Does not do train/test split"""
  # Featurize hiv dataset
  logger.info("About to featurize hiv dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "hiv/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "HIV.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/HIV.csv'
    )

  hiv_tasks = ["HIV_active"]

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return hiv_tasks, all_dataset, transformers
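
Each of these loaders follows the same pattern: resolve the cache location with deepchem.utils.get_data_dir, download the raw file into it if it is missing, then featurize and (with reload=True) restore previously saved splits. A usage sketch, assuming the deepchem.molnet re-export through which these loaders are normally called:

import deepchem as dc

# First call downloads HIV.csv into get_data_dir() and featurizes it;
# later calls with reload=True restore the saved splits from disk.
tasks, datasets, transformers = dc.molnet.load_hiv(
    featurizer='ECFP', split='index', reload=True)
train, valid, test = datasets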