How to use the deepchem.data.DiskDataset.from_numpy function in deepchem

To help you get started, we've selected a few examples showing how deepchem.data.DiskDataset.from_numpy is used in public projects.

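A minimal sketch of the basic call, using synthetic NumPy arrays and a made-up task name; keyword support (for example the verbose flag seen in some of the snippets below) varies between DeepChem releases.

import numpy as np
import deepchem as dc

# Synthetic data: 100 samples, 10 features, one regression task.
X = np.random.rand(100, 10)
y = np.random.rand(100, 1)
w = np.ones_like(y)
ids = np.array(["compound_%d" % i for i in range(100)])

# from_numpy writes the arrays to disk and returns a DiskDataset.
# data_dir is optional; a temporary directory is used when it is omitted.
dataset = dc.data.DiskDataset.from_numpy(
    X, y, w, ids, tasks=["task0"], data_dir="/tmp/example_dataset")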

github ATOMconsortium / AMPL / atomsci / ddm / pipeline / splitting.py View on Github external
self.splitter = dc.splits.ScaffoldSplitter()
            train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
        else:
            # TODO: Add special handling for AVE splitter
            train_cv, test = self.splitter.train_test_split(dataset, seed=np.random.seed(123), frac_train=train_frac)
            train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
    
        train_valid_dsets = []
        train_valid_attr = []

        if self.needs_smiles():
            # Now that DeepChem splitters have done their work, replace the SMILES strings in the split 
            # dataset objects with actual compound IDs.
            for train, valid in train_cv_pairs:
                train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
                train = DiskDataset.from_numpy(train.X, train.y, ids=train_attr.index.values, verbose=False)

                valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
                valid = DiskDataset.from_numpy(valid.X, valid.y, ids=valid_attr.index.values, verbose=False)

                train_valid_dsets.append((train, valid))
                train_valid_attr.append((train_attr, valid_attr))

            test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
            test = DiskDataset.from_numpy(test.X, test.y, ids=test_attr.index.values, verbose=False)
        else:
            # Otherwise just subset the ID-to-SMILES maps.
            for train, valid in train_cv_pairs:
                train_attr = select_attrs_by_dset_ids(train, attr_df)
                valid_attr = select_attrs_by_dset_ids(valid, attr_df)
                train_valid_attr.append((train_attr, valid_attr))
            train_valid_dsets = train_cv_pairs
            test_attr = select_attrs_by_dset_ids(test, attr_df)

        return train_valid_dsets, test, train_valid_attr, test_attr
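The AMPL snippet above shows a common pattern: after the DeepChem splitters have produced the train/valid/test subsets, each subset is rebuilt with from_numpy so that its ids array can be replaced (here, SMILES strings are swapped for compound IDs). A simplified, hypothetical version of that rebuild step (replace_ids is not an AMPL or DeepChem function):

import deepchem as dc

def replace_ids(split_dataset, new_ids):
    # Write a fresh DiskDataset that reuses the split's features, labels and
    # weights but carries a different ids array.
    return dc.data.DiskDataset.from_numpy(
        split_dataset.X, split_dataset.y, split_dataset.w, ids=new_ids)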
github deepchem / deepchem / deepchem / splits / splitters.py View on Github external
def split(self, dataset, frac_split, split_dirs=None):
    """
    Method that does the bulk of splitting the dataset.
    """
    if split_dirs is not None:
      assert len(split_dirs) == 2
    else:
      split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

    # Handle edge case where frac_split is 1
    if frac_split == 1:
      dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
      dataset_2 = None
      return dataset_1, dataset_2
    X, y, w, ids = randomize_arrays((dataset.X, dataset.y, dataset.w,
                                     dataset.ids))
    if len(y.shape) == 1:
      y = np.expand_dims(y, 1)
    if len(w.shape) == 1:
      w = np.expand_dims(w, 1)
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
      # Copy weights up to the split index into the first half.
      w_1[:split_index, task] = w[:split_index, task]
github deepchem / deepchem / examples / binding_pockets / binding_pocket_datasets.py View on Github external
pocket_featurizer, ligand_featurizer, pdb_subdir, pdb_code)
    if features is None:
      print("Featurization failed!")
      continue
    all_features.append(features)
    all_labels.append(labels)
    ids = np.array(["%s%d" % (pdb_code, i) for i in range(len(labels))])
    all_ids.append(ids)
  time2 = time.time()
  print("TIMING: PDBBind Pocket Featurization took %0.3f s" % (time2-time1))
  X = np.vstack(all_features)
  y = np.concatenate(all_labels)
  w = np.ones_like(y)
  ids = np.concatenate(all_ids)
   
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids, data_dir=data_dir)
  return dataset, tasks
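Because a data_dir is passed to from_numpy here, the featurized dataset is persisted to that directory and can be reopened later without re-running featurization. A short sketch of the reload step (the path is only an illustration):

import deepchem as dc

# Reopen a dataset previously written by from_numpy(..., data_dir=...).
dataset = dc.data.DiskDataset("/path/to/saved_dataset")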
github deepchem / deepchem / contrib / atomicconv / acnn / core / opt_temporal.py View on Github external
complex_num_atoms = 701
max_num_neighbors = 12
neighbor_cutoff = 12.0

train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []
#transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
#for transformer in transformers:
#  train_dataset = transformer.transform(train_dataset)
#  test_dataset = transformer.transform(test_dataset)

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

at = [6, 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30., 35., 53.]
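In the atomic-convolution scripts above (and the near-identical ones below), from_numpy is used a second time to rewrite an already-featurized DiskDataset after its labels have been rescaled: the -logKd/Ki values are scaled by -2.479/4.184 (RT in kJ/mol, converted to kcal), and the tasks keyword attaches the task names to the rebuilt dataset.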
github deepchem / deepchem / examples / qm7 / qm7_datasets.py View on Github external
def load_qm7_from_mat(featurizer=None, split='stratified'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "qm7.mat")

  if not os.path.exists(dataset_file):
    os.system('wget -P ' + current_dir +
              ' http://www.quantum-machine.org/data/qm7.mat')
  dataset = scipy.io.loadmat(dataset_file)

  X = dataset['X']
  y = dataset['T']
  w = np.ones_like(y)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
  print(len(dataset))

  current_dir = os.path.dirname(os.path.realpath(__file__))
  split_file = os.path.join(current_dir, "./qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
      row_int = (np.asarray(list(map(int, row)))).tolist()
      split_indices.append(row_int)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
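When ids is None, as in the qm7 loader above, from_numpy fills in default ids for each row (an integer range, as far as I recall), so the dataset can still be indexed by id. A tiny check of that behavior:

import numpy as np
import deepchem as dc

X = np.random.rand(5, 3)
y = np.random.rand(5, 1)
dataset = dc.data.DiskDataset.from_numpy(X, y)  # no weights, no ids supplied
print(dataset.ids)  # default ids, one per row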
github deepchem / deepchem / contrib / atomicconv / acnn / core / opt_random.py View on Github external
complex_num_atoms = 701
max_num_neighbors = 12
neighbor_cutoff = 12.0

train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []
#transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
#for transformer in transformers:
#  train_dataset = transformer.transform(train_dataset)
#  test_dataset = transformer.transform(test_dataset)

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

at = [6, 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30., 35., 53.]
github deepchem / deepchem / contrib / atomicconv / acnn / core / tensor_graph_hyper_param_search.py View on Github external
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

batch_size = 24
radial1 = [
    [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
    [
        1.5, 2.5, 3.5, 4.5, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.0,
        10.5
    ],
]
radial2 = [
    [0.0, 2.0, 4.0],
github deepchem / deepchem / deepchem / splits / splitters.py View on Github external
y = np.expand_dims(y, 1)
    if len(w.shape) == 1:
      w = np.expand_dims(w, 1)
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
      # Copy weights up to the split index into the first half.
      w_1[:split_index, task] = w[:split_index, task]
      w_2[split_index:, task] = w[split_index:, task]

    # Drop rows whose weights are all zeros in each half.
    rows_1 = w_1.any(axis=1)
    X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
    dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

    rows_2 = w_2.any(axis=1)
    X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
    dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)

    return dataset_1, dataset_2
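The boolean masks in this splitter keep only the rows that still carry a nonzero weight for at least one task, so neither half written by from_numpy contains all-zero rows. A standalone illustration of the masking idea with a toy weight matrix:

import numpy as np

w_1 = np.array([[1., 0.],
                [0., 0.],
                [0., 1.]])
rows_1 = w_1.any(axis=1)  # [True, False, True]: row 1 has no nonzero weight
print(rows_1)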
github deepchem / deepchem / contrib / atomicconv / acnn / refined / opt_scaffold.py View on Github external
train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)

transformers = []
# Convert -logKd/Ki labels: multiply by RT (2.479 kJ/mol) and divide by 4.184 to work in kcal/mol
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)
y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

# Atomic convolution variables
# at = atomic numbers (atom types)
# radial basis function parameters [cutoff, mean, width]
at = [
    1., 6., 7., 8., 9., 11., 12., 15., 16., 17., 19., 20., 25., 26., 27., 28.,
    29., 30., 34., 35., 38., 48., 53., 55., 80.
]
radial = [[1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
          [0.0], [0.4]]
rp = create_symmetry_parameters(radial)