Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def download_data(dataset: Dict[str, str]) -> io.BytesIO:
    """Download a dataset into memory and verify its MD5 checksum.

    Parameters
    ----------
    dataset: Dict[str, str]
        Mapping that provides at least the keys ``"url"`` (where to fetch
        the data from) and ``"md5sum"`` (expected hexadecimal MD5 digest of
        the downloaded bytes).

    Returns
    -------
    io.BytesIO
        In-memory buffer holding the downloaded bytes, rewound to offset 0.

    Raises
    ------
    RuntimeError
        If the MD5 digest of the downloaded bytes differs from
        ``dataset["md5sum"]``.
    """
    # Imported lazily from the parent package, as in the original code.
    from .. import session

    chunk_size = 1024

    f = session.get(dataset["url"], stream=True)

    # The header may be absent (e.g. chunked transfer encoding); tqdm accepts
    # total=None and then shows a plain counter instead of a percentage.
    content_length = f.headers.get("Content-Length")
    if content_length is None:
        n_chunks = None
    else:
        # Ceiling division. The previous expression
        # ``total // 1024 + 1 if total % 1024 > 0 else 0`` was parsed as
        # ``(total // 1024 + 1) if ... else 0`` and therefore reported 0
        # chunks whenever the size was an exact multiple of the chunk size.
        n_chunks = -(-int(content_length) // chunk_size)

    buffer = io.BytesIO()
    for chunk in tqdm(
        f.iter_content(chunk_size),
        total=n_chunks,
        desc="download",
    ):
        buffer.write(chunk)

    buffer.seek(0)

    # getbuffer() exposes the whole content without copying it and does not
    # depend on the current stream position.
    compute_md5 = md5(buffer.getbuffer()).hexdigest()
    if compute_md5 != dataset["md5sum"]:
        raise RuntimeError(
            f"Error in MD5 check: {compute_md5} instead of {dataset['md5sum']}"
        )

    return buffer
show_progress: bool
Show progress bar
"""
if os.path.exists(file_path) and os.path.getsize(file_path):
return
tmp_file_path = file_path + ".part"
first_byte = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0
file_mode = "ab" if first_byte else "wb"
file_size = -1
try:
file_size = int(requests.head(url).headers["Content-length"])
headers = {"Range": "bytes=%s-" % first_byte}
r = requests.get(url, headers=headers, stream=True)
if show_progress:
desc = "Downloading {}".format(url.split("/")[-1])
pbar = tqdm(
total=file_size,
initial=first_byte,
unit="B",
unit_scale=True,
desc=desc,
)
with open(tmp_file_path, file_mode) as f:
for chunk in r.iter_content(chunk_size=block_size):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
if show_progress:
pbar.update(block_size)
if show_progress:
pbar.close()
except IOError as e:
sys.stderr.write("IO Error - {}\n".format(e))
del slurm_config['experiments_per_job']
start_slurm_job(collection, exp_array, unobserved, post_mortem,
name=job_name, output_dir_path=output_dir_path, **slurm_config)
else:
login_node_name = 'fs'
if login_node_name in os.uname()[1]:
logging.error("Refusing to run a compute experiment on a login node. "
"Please use Slurm or a compute node.")
sys.exit(1)
[get_output_dir_path(exp) for exp in exps_list] # Check if output dir exists
logging.info(f'Starting local worker thread that will run up to {nexps} experiment{s_if(nexps)}, '
f'until no queued experiments remain.')
if not unobserved:
collection.update_many({'_id': {'$in': [e['_id'] for e in exps_list]}}, {"$set": {"status": "PENDING"}})
num_exceptions = 0
tq = tqdm(enumerate(exps_list))
for i_exp, exp in tq:
if output_to_file:
output_dir_path = get_output_dir_path(exp)
else:
output_dir_path = None
success = start_local_job(collection, exp, unobserved, post_mortem, output_dir_path)
if success is False:
num_exceptions += 1
tq.set_postfix(failed=f"{num_exceptions}/{i_exp} experiments")
X_train, X_test = to_array(X_train, X_test)
# how many features to mask
assert X_train.shape[1] == X_test.shape[1]
# this is the model we will retrain many times
model_masked = model_generator()
# keep nkeep top features and re-train the model for each test explanation
X_train_tmp = np.zeros(X_train.shape)
X_test_tmp = np.zeros(X_test.shape)
yp_masked_test = np.zeros(y_test.shape)
tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
last_nkeep = _keep_cache.get("nkeep", None)
last_yp_masked_test = _keep_cache.get("yp_masked_test", None)
for i in tqdm(range(len(y_test)), "Retraining for the 'keep' metric"):
if cache_match and last_nkeep[i] == nkeep[i]:
yp_masked_test[i] = last_yp_masked_test[i]
elif nkeep[i] == attr_test.shape[1]:
yp_masked_test[i] = trained_model.predict(X_test[i:i+1])[0]
else:
# mask out the most important features for this test instance
X_train_tmp[:] = X_train
X_test_tmp[:] = X_test
ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
X_train_tmp[:,ordering[nkeep[i]:]] = X_train[:,ordering[nkeep[i]:]].mean()
X_test_tmp[i,ordering[nkeep[i]:]] = X_train[:,ordering[nkeep[i]:]].mean()
# retrain the model and make a prediction
model_masked.fit(X_train_tmp, y_train)
yp_masked_test[i] = model_masked.predict(X_test_tmp[i:i+1])[0]
)
else:
cumul = []
with ProcessPoolExecutor(max_workers=max_workers) as executor:
iterator = self.wrapped_t.iterate(**self.iterate_kw)
if len(self.tqdm_kw):
iterator = tqdm(iterator, **self.tqdm_kw)
tasks = {
executor.submit(
apply, self.stacked_ops, idx, flight
): flight
for idx, flight in enumerate(iterator)
}
tasks_completed = as_completed(tasks)
if desc is not None:
tasks_completed = tqdm(
tasks_completed,
total=len(tasks),
desc=desc,
leave=False,
)
for future in tasks_completed:
cumul.append(future.result())
# return Traffic.from_flights
return self.wrapped_t.__class__.from_flights(
[flight for flight in cumul if flight is not None]
)
X_train, X_test = to_array(X_train, X_test)
# how many features to mask
assert X_train.shape[1] == X_test.shape[1]
# this is the model we will retrain many times
model_masked = model_generator()
# mask nmask top features and re-train the model for each test explanation
X_train_tmp = np.zeros(X_train.shape)
X_test_tmp = np.zeros(X_test.shape)
yp_masked_test = np.zeros(y_test.shape)
tie_breaking_noise = const_rand(X_train.shape[1]) * 1e-6
last_nmask = _remove_cache.get("nmask", None)
last_yp_masked_test = _remove_cache.get("yp_masked_test", None)
for i in tqdm(range(len(y_test)), "Retraining for the 'remove' metric"):
if cache_match and last_nmask[i] == nmask[i]:
yp_masked_test[i] = last_yp_masked_test[i]
elif nmask[i] == 0:
yp_masked_test[i] = trained_model.predict(X_test[i:i+1])[0]
else:
# mask out the most important features for this test instance
X_train_tmp[:] = X_train
X_test_tmp[:] = X_test
ordering = np.argsort(-attr_test[i,:] + tie_breaking_noise)
X_train_tmp[:,ordering[:nmask[i]]] = X_train[:,ordering[:nmask[i]]].mean()
X_test_tmp[i,ordering[:nmask[i]]] = X_train[:,ordering[:nmask[i]]].mean()
# retrain the model and make a prediction
model_masked.fit(X_train_tmp, y_train)
yp_masked_test[i] = model_masked.predict(X_test_tmp[i:i+1])[0]
# Get dataloader of current mode
dataloader = self._loaders[mode]
dataloader_iter = iter(dataloader)
num_batch = len(dataloader)
# Initialize loss of steps and epoch
_steps_loss = 0.0
_epoch_loss = 0.0
# Generate progress bar of this epoch from dataloader
if self.has_max_num_iterations:
num_batch_epochs = min(self.max_num_iterations, num_batch)
# TODO: add metrics logging to tqdm
# mdesc = ", "+ ", ".join([m.capitalize() + ": ?" for m in self.metrics])
desc = "Current Mode: %s, Step Loss: ?" % mode
pbar = tqdm(range(num_batch_epochs), desc=desc)
# Iterate through the progress bar
for i in pbar:
# Get next batch from dataloader
batch_values = next(dataloader_iter)
# Calculate prediction and loss of the batch
prediction, loss = self._iterate(batch_values, backward=backward)
# Convert loss from torch.Tensor to float
loss_value = loss.cpu().item()
_steps_loss += loss_value
_epoch_loss += loss_value
# Calculate metrics
# if self.has_metrics:
import gridfs
if all_collections:
config = get_mongodb_config()
db = get_database(**config)
collection_names = db.list_collection_names()
else:
collection = get_collection(db_collection_name)
db = collection.database
collection_names = [collection.name]
collection_names = set(collection_names)
collection_blacklist = {'fs.chunks', 'fs.files'}
collection_names = collection_names - collection_blacklist
fs = gridfs.GridFS(db)
referenced_files = set()
for collection_name in tqdm(collection_names):
collection = db[collection_name]
experiments = list(collection.find({}, {'artifacts': 1, 'experiment.sources': 1, 'source_files': 1}))
for exp in experiments:
if 'artifacts' in exp:
referenced_files.update({x[1] for x in exp['artifacts']})
if 'experiment' in exp and 'sources' in exp['experiment']:
referenced_files.update({x[1] for x in exp['experiment']['sources']})
if 'source_files' in exp:
referenced_files.update({x[1] for x in exp['source_files']})
all_files_in_db = list(db['fs.files'].find({}, {'_id': 1, 'filename': 1, 'metadata': 1}))
filtered_file_ids = set()
for file in all_files_in_db:
if 'filename' in file:
filename = file['filename']
file_collection = None