Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this is a fragment -- the header of the enclosing definition is
# not visible here and the original indentation has been lost, so the exact
# nesting of the statements below cannot be confirmed from this view.
# Dump `all_sample_ids` as JSON to `all_sample_ids.json` under `self.root`.
all_sample_ids_file = os.path.join(self.root, 'all_sample_ids.json')
with open(all_sample_ids_file, 'w') as f:
json.dump(all_sample_ids, f)
# Delete `csv_file` if it is present on disk.
if os.path.isfile(csv_file):
os.remove(csv_file)
print('Done')
self._process_clinical_matrices()
# Create label files
# One JSON file per split, named by `self.filename_tasks.format(split)`.
for split in ['train', 'val', 'test']:
filename = os.path.join(self.root, self.filename_tasks.format(split))
data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')
with open(filename, 'w') as f:
# Each key of `data` is split at most once on '|'; the resulting lists are
# sorted before being written.
labels = sorted([key.split('|', 1) for key in data])
json.dump(labels, f)
# Clean up
# Remove the `<clinical matrix filename>.gz` file of every cancer, when present.
for cancer in self.cancers:
filename = self.clinical_matrix_filename.format(cancer)
rawpath = os.path.join(clinical_matrices_folder, '{0}.gz'.format(filename))
if os.path.isfile(rawpath):
os.remove(rawpath)
# NOTE(review): with indentation lost it is unclear whether this `return` ends
# the code above or belongs to a guard that opens the code below -- confirm.
return
# NOTE(review): fragment of a method body -- the `def` line is outside this
# view and the original indentation has been lost.
# Download the archive named by the basename of `self.download_url` into
# `self.root`; `self.tgz_md5` is passed along (presumably for checksum
# verification -- confirm against `download_url`).
filename = os.path.basename(self.download_url)
download_url(self.download_url, self.root, filename, self.tgz_md5)
tgz_filename = os.path.join(self.root, filename)
# NOTE(review): `extractall` is called without any member filtering; confirm
# the archive is trusted or restrict extraction paths.
with tarfile.open(tgz_filename, 'r') as f:
f.extractall(self.root)
image_folder = os.path.join(self.root, self.image_folder)
for split in ['train', 'val', 'test']:
filename = os.path.join(self.root, self.filename.format(split))
# Splits whose output file already exists are skipped.
if os.path.isfile(filename):
continue
# NOTE(review): unlike the other visible `get_asset` calls, no `dtype='json'`
# is passed here -- confirm that `get_asset` infers it.
labels = get_asset(self.folder, '{0}.json'.format(split))
labels_filename = os.path.join(self.root, self.filename_labels.format(split))
with open(labels_filename, 'w') as f:
json.dump(labels, f)
# Write one dataset per label inside the 'datasets' group; every element is a
# variable-length uint8 array holding the raw bytes of one '*.jpg' file.
with h5py.File(filename, 'w') as f:
group = f.create_group('datasets')
dtype = h5py.special_dtype(vlen=np.uint8)
for i, label in enumerate(tqdm(labels, desc=filename)):
images = glob.glob(os.path.join(image_folder, label, '*.jpg'))
# Sort the matched paths so the element order does not depend on `glob`.
images.sort()
dataset = group.create_dataset(label, (len(images),), dtype=dtype)
# NOTE(review): the loop below rebinds `i` and the `with` rebinds `f`, shadowing
# the outer loop index and the open HDF5 file handle of the same names.
for i, image in enumerate(images):
with open(image, 'rb') as f:
array = bytearray(f.read())
dataset[i] = np.asarray(array, dtype=np.uint8)
def get_task_id_splits(meta_split):
    """Load the JSON asset named after a meta-split from the TCGA asset folder.

    Parameters
    ----------
    meta_split : str
        Name of the split; the asset that is read is `<meta_split>.json`.

    Returns
    -------
    The value produced by `get_asset` for that JSON asset.
    """
    asset_name = '{}.json'.format(meta_split)
    return get_asset(TCGA.folder, asset_name, dtype='json')
# NOTE(review): fragment of a method body -- the `def` line and the definitions
# of `characters`, `group` and `name` are outside this view, and the original
# indentation has been lost.
# For each (alphabet, character) entry, create a uint8 dataset named
# '<alphabet>/<character>' of shape (number of PNG files, 105, 105).
for _, alphabet, character in characters:
filenames = glob.glob(os.path.join(self.root, name,
alphabet, character, '*.png'))
dataset = group.create_dataset('{0}/{1}'.format(alphabet,
character), (len(filenames), 105, 105), dtype='uint8')
for i, char_filename in enumerate(filenames):
# Convert to mode 'L' (8-bit grayscale, presumably via Pillow) and store the
# inverted image.
image = Image.open(char_filename, mode='r').convert('L')
dataset[i] = ImageOps.invert(image)
# Delete the `name` directory under `self.root`.
# NOTE(review): presumably runs once per `name` after its images are stored --
# nesting cannot be confirmed from this view.
shutil.rmtree(os.path.join(self.root, name))
# Write one label file per split; `self.filename_labels` is formatted with the
# 'vinyals_' prefix and the split name.
for split in ['train', 'val', 'test']:
filename = os.path.join(self.root, self.filename_labels.format(
'vinyals_', split))
data = get_asset(self.folder, '{0}.json'.format(split), dtype='json')
with open(filename, 'w') as f:
# Flatten the nested mapping name -> alphabet -> characters into sorted
# ('images_<name>', alphabet, character) triples.
labels = sorted([('images_{0}'.format(name), alphabet, character)
for (name, alphabets) in data.items()
for (alphabet, characters) in alphabets.items()
for character in characters])
json.dump(labels, f)
def get_cancers():
    """Return the contents of the `cancers.json` asset in the TCGA asset folder."""
    cancers = get_asset(TCGA.folder, 'cancers.json', dtype='json')
    return cancers
def download(self):
    """Download the dataset and write the per-split label files.

    Returns immediately when `self._check_integrity()` is truthy. Otherwise
    the parent class's `download` is run, the `self.subfolder` directory is
    created under `self.root` if missing, and for each of the 'train', 'val'
    and 'test' splits the matching JSON asset is written to that split's
    label file. Label files that already exist are left untouched.
    """
    if self._check_integrity():
        return

    super(CIFARFSClassDataset, self).download()

    target_dir = os.path.join(self.root, self.subfolder)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for split_name in ('train', 'val', 'test'):
        labels_path = os.path.join(
            target_dir, self.filename_labels.format(split_name))
        # Only write the label files that are not on disk yet.
        if not os.path.isfile(labels_path):
            asset_name = '{0}.json'.format(split_name)
            content = get_asset(self.folder, self.subfolder, asset_name,
                                dtype='json')
            with open(labels_path, 'w') as handle:
                json.dump(content, handle)