How to use the deepchem.data.DiskDataset function in deepchem

To help you get started, we've selected a few deepchem examples based on popular ways DiskDataset is used in public projects.

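Throughout these snippets, dc.data.DiskDataset is DeepChem's on-disk, sharded dataset class. As a quick orientation, here is a minimal, hedged sketch of the two ways it most often appears below: built from in-memory arrays with from_numpy, and reopened later from its directory (exact signatures vary slightly across DeepChem versions):

import numpy as np
import deepchem as dc

# Build a DiskDataset from arrays; shards are written to a directory
# (a temporary one if data_dir is not given).
X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
dataset = dc.data.DiskDataset.from_numpy(X, y, tasks=["task0"])

# The directory is self-describing, so it can be reopened directly.
reloaded = dc.data.DiskDataset(dataset.data_dir)
print(reloaded.X.shape, reloaded.get_task_names())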

github deepchem / deepchem / examples / benchmark_curve.py (view on GitHub)
        metric = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)]
      elif dataset in [
          'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo',
          'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
      ]:
        mode = 'regression'
        metric = [dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)]

      pair = (dataset, model)
      if pair in CheckFeaturizer:
        featurizer = CheckFeaturizer[pair][0]
        n_features = CheckFeaturizer[pair][1]

      tasks, all_dataset, transformers = load_dataset(
          dataset, featurizer, split='index')
      all_dataset = dc.data.DiskDataset.merge(all_dataset)
      for frac_train in frac_trains:
        splitters = {
            'index': dc.splits.IndexSplitter(),
            'random': dc.splits.RandomSplitter(),
            'scaffold': dc.splits.ScaffoldSplitter(),
            'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
        }
        splitter = splitters[split]
        np.random.seed(seed)
        train, valid, test = splitter.train_valid_test_split(
            all_dataset,
            frac_train=frac_train,
            frac_valid=1 - frac_train,
            frac_test=0.)
        test = valid  # frac_test=0., so the validation split doubles as the test set
        if mode == 'classification':
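This benchmark merges the pre-split MolNet data back into one DiskDataset and re-splits it at varying training fractions. A hedged standalone sketch of that merge-then-resplit pattern, using the real load_delaney loader (the fractions here are illustrative):

import numpy as np
import deepchem as dc

tasks, (train, valid, test), transformers = dc.molnet.load_delaney(
    featurizer='ECFP', split='index')
all_dataset = dc.data.DiskDataset.merge([train, valid, test])

splitter = dc.splits.RandomSplitter()
np.random.seed(123)
train, valid, test = splitter.train_valid_test_split(
    all_dataset, frac_train=0.8, frac_valid=0.2, frac_test=0.)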
github deepchem / deepchem / deepchem / molnet / load_function / uv_datasets.py (view on GitHub)
  data_dir = deepchem.utils.get_data_dir()
  data_dir = os.path.join(data_dir, "UV")

  if not os.path.exists(data_dir):
    os.mkdir(data_dir)

  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):

    logger.info("Reloading existing datasets")
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)

  else:
    logger.info("Featurizing datasets")
    train_dataset, valid_dataset, test_dataset = \
    gen_uv(UV_tasks=UV_tasks, data_dir=data_dir, train_dir=train_dir,
           valid_dir=valid_dir, test_dir=test_dir, shard_size=shard_size)

  transformers = get_transformers(train_dataset)

  return UV_tasks, (train_dataset, valid_dataset, test_dataset), transformers
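The loader above treats the DiskDataset directories as a cache: if the featurized shards already exist they are reopened, otherwise featurization runs once. A minimal sketch of that pattern (build_dataset stands in for a featurization helper like gen_uv):

import os
import deepchem

def load_or_build(data_dir, build_dataset):
  if os.path.exists(data_dir):
    # Reload previously featurized shards from disk.
    return deepchem.data.DiskDataset(data_dir)
  # First run: featurize and write the shards.
  return build_dataset(data_dir)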
github deepchem / deepchem / deepchem / molnet / load_function / pdbbind_datasets.py (view on GitHub)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/full_grid.tar.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/refined_grid.tar.gz'
    )
    if not os.path.exists(pdbbind_dir):
      os.makedirs(pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'core_grid.tar.gz'), pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'full_grid.tar.gz'), pdbbind_dir)
    deepchem.utils.untargz_file(
        os.path.join(data_dir, 'refined_grid.tar.gz'), pdbbind_dir)

  return deepchem.data.DiskDataset(dataset_dir), tasks
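Here the featurization has been done upstream: the loader downloads pre-featurized archives, untars them, and simply opens the resulting directory as a DiskDataset. A hedged sketch for a single archive (the untarred folder name 'core_grid' is an assumption mirroring the archive name; get_data_dir() resolves DeepChem's data cache, honoring the DEEPCHEM_DATA_DIR environment variable when set):

import os
import deepchem

data_dir = deepchem.utils.get_data_dir()
archive = os.path.join(data_dir, 'core_grid.tar.gz')
if not os.path.exists(archive):
  deepchem.utils.download_url(
      'http://deepchem.io.s3-website-us-west-1.amazonaws.com/'
      'featurized_datasets/core_grid.tar.gz')
deepchem.utils.untargz_file(archive, data_dir)
# The untarred directory is opened directly as a DiskDataset
# ('core_grid' is an assumed folder name).
dataset = deepchem.data.DiskDataset(os.path.join(data_dir, 'core_grid'))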
github deepchem / deepchem / examples / uv / UV_datasets.py (view on GitHub)
  UV_tasks = (['logTIC'] +
              ['w__%d' % i for i in range(210, 401)])

  current_dir = os.path.dirname(os.path.realpath(__file__))
  raw_train_dir = os.path.join(current_dir, "raw_train_dir")
  train_dir = os.path.join(current_dir, "train_dir") 
  valid_dir = os.path.join(current_dir, "valid_dir") 
  test_dir = os.path.join(current_dir, "test_dir") 

  if (os.path.exists(raw_train_dir) and
      os.path.exists(train_dir) and
      os.path.exists(valid_dir) and
      os.path.exists(test_dir)):
    print("Reloading existing datasets")
    raw_train_dataset = dc.data.DiskDataset(raw_train_dir)
    train_dataset = dc.data.DiskDataset(train_dir)
    valid_dataset = dc.data.DiskDataset(valid_dir)
    test_dataset = dc.data.DiskDataset(test_dir)
  else:
    print("Featurizing datasets")
    (raw_train_dataset, train_dataset, valid_dataset, test_dataset) = \
      gen_uv(UV_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
                  shard_size=shard_size)

  transformers = get_transformers(raw_train_dataset)
  return UV_tasks, (train_dataset, valid_dataset, test_dataset), transformers
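Note that the transformers are fit on the raw training data. get_transformers is this example's helper; a hedged sketch of what applying such transformers typically looks like (NormalizationTransformer is a real DeepChem transformer, but treating it as this helper's choice is an assumption):

import deepchem as dc

transformers = [
    dc.trans.NormalizationTransformer(
        transform_y=True, dataset=raw_train_dataset)
]
for transformer in transformers:
  train_dataset = transformer.transform(train_dataset)
  valid_dataset = transformer.transform(valid_dataset)
  test_dataset = transformer.transform(test_dataset)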
github deepchem / deepchem / contrib / atomicconv / acnn / core / tensor_graph_hyper_param_search.py (view on GitHub)
import os
import itertools
import time

import numpy as np
import tensorflow as tf
import deepchem as dc

seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

base_dir = os.getcwd()
data_dir = os.path.join(base_dir, "datasets")
train_dir = os.path.join(data_dir, "scaffold_train")
test_dir = os.path.join(data_dir, "scaffold_test")

train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)
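DiskDataset.y materializes the labels into a fresh NumPy array, so scaling it does not modify the shards on disk; that is why these scripts rebuild the dataset with from_numpy. A self-contained sketch of the relabel-and-rebuild round trip (synthetic data; the -1 * 2.479 / 4.184 factor above rescales -logKd/Ki into an energy in kcal/mol):

import numpy as np
import deepchem as dc

dataset = dc.data.DiskDataset.from_numpy(
    np.random.rand(8, 3), np.random.rand(8, 1), tasks=["-logKd/Ki"])
y_scaled = dataset.y * (-1 * 2.479 / 4.184)  # relabel in memory
dataset = dc.data.DiskDataset.from_numpy(    # persist the new labels
    dataset.X, y_scaled, dataset.w, dataset.ids, tasks=["-logKd/Ki"])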
github deepchem / deepchem / contrib / atomicconv / acnn / core / opt_stratified.py (view on GitHub)
#for transformer in transformers:
#  train_dataset = transformer.transform(train_dataset)
#  test_dataset = transformer.transform(test_dataset)

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

# Atomic numbers of the atom types considered (C, N, O, F, Na, Mg, P, S,
# Cl, Ca, Mn, Zn, Br, I)
at = [6, 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30., 35., 53.]
radial = [[
    1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5,
    9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0
], [0.0, 4.0, 8.0], [0.4]]
#radial = [[12.0], [0.0, 4.0, 8.0], [0.4]]
rp = create_symmetry_parameters(radial)
layer_sizes = [32, 32, 16]
weight_init_stddevs = [
    1 / np.sqrt(layer_sizes[0]), 1 / np.sqrt(layer_sizes[1]),
    1 / np.sqrt(layer_sizes[2])
]
github deepchem / deepchem / deepchem / molnet / load_function / kaggle_datasets.py (view on GitHub)
  data_dir = deepchem.utils.get_data_dir()

  data_dir = os.path.join(data_dir, "kaggle")
  if not os.path.exists(data_dir):
    os.mkdir(data_dir)
  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):
    logger.info("Reloading existing datasets")
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)
  else:
    logger.info("Featurizing datasets")
    train_dataset, valid_dataset, test_dataset = \
      gen_kaggle(KAGGLE_tasks, train_dir, valid_dir, test_dir, data_dir,
                  shard_size=shard_size)

  transformers = get_transformers(train_dataset)
  return KAGGLE_tasks, (train_dataset, valid_dataset,
                        test_dataset), transformers
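The shard_size passed to gen_kaggle controls how many rows each on-disk shard holds. Because a DiskDataset is shard-backed, large datasets like this one can be streamed without loading everything into memory, for example:

# Iterate the dataset shard by shard; each item is an (X, y, w, ids) tuple.
for X_shard, y_shard, w_shard, ids_shard in train_dataset.itershards():
  print(X_shard.shape)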
github deepchem / deepchem / contrib / atomicconv / acnn / refined / opt_random.py (view on GitHub)
data_dir = os.path.join(base_dir, "datasets")
train_dir = os.path.join(data_dir, "random_train")
test_dir = os.path.join(data_dir, "random_test")
model_dir = os.path.join(base_dir, "random_model")

# Model constants
frag1_num_atoms = 153
frag2_num_atoms = 1119
complex_num_atoms = 1254
max_num_neighbors = 12
neighbor_cutoff = 12.0

# Load and transform datasets
pdbbind_tasks = ["-logKd/Ki"]
train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)

transformers = []
# convert -logKd/Ki to dG = +RT*logKd/Ki [kcal/mol]; 2.479 kJ/mol is RT
# at 298 K and dividing by 4.184 converts kJ to kcal
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)
y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
github deepchem / deepchem / deepchem / feat / featurize.py (view on GitHub)
# Stdlib imports this function relies on; DiskDataset and log come from
# DeepChem's own modules in the release this file shipped with.
import time
from functools import partial


def featurize_map_function(args):
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  ((loader, shard_size, input_type, data_dir), (shard_num, raw_df_shard)) = args
  log("Loading shard %d of size %s from file." % (shard_num+1, str(shard_size)),
      loader.verbosity)
  log("About to featurize shard.", loader.verbosity)
  write_fn = partial(
      DiskDataset.write_dataframe, data_dir=data_dir,
      featurizer=loader.featurizer, tasks=loader.tasks,
      mol_id_field=loader.id_field, verbosity=loader.verbosity)
  ############################################################## TIMING
  shard_time1 = time.time()
  ############################################################## TIMING
  metadata_row = loader._featurize_shard(
      raw_df_shard, write_fn, shard_num, input_type)
  ############################################################## TIMING
  shard_time2 = time.time()
  log("TIMING: shard featurization took %0.3f s" % (shard_time2-shard_time1),
      loader.verbosity)
  ############################################################## TIMING
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: featurization map function took %0.3f s" % (time2-time1),
      loader.verbosity)
  return metadata_row
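write_dataframe and the metadata rows above are internal plumbing from an early DeepChem release. In newer releases, the public way to build a DiskDataset shard by shard is DiskDataset.create_dataset over a generator of (X, y, w, ids) tuples; a hedged sketch with synthetic shards:

import numpy as np
import deepchem as dc

def shard_generator(n_shards=3, shard_size=100):
  for shard in range(n_shards):
    X = np.random.rand(shard_size, 4)
    y = np.random.rand(shard_size, 1)
    w = np.ones((shard_size, 1))
    # Globally unique ids across shards
    ids = np.arange(shard * shard_size, (shard + 1) * shard_size)
    yield X, y, w, ids

dataset = dc.data.DiskDataset.create_dataset(shard_generator(), tasks=["t0"])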