        for exp_array in exp_arrays:
            job_name = get_exp_name(exp_array[0][0], collection.name)
            output_dir_path = get_output_dir_path(exp_array[0][0])
            slurm_config = exp_array[0][0]['slurm']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exp_array, unobserved, post_mortem,
                            name=job_name, output_dir_path=output_dir_path, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            logging.error("Refusing to run a compute experiment on a login node. "
                          "Please use Slurm or a compute node.")
            sys.exit(1)
        [get_output_dir_path(exp) for exp in exps_list]  # Check if output dir exists
        logging.info(f'Starting local worker thread that will run up to {nexps} experiment{s_if(nexps)}, '
                     f'until no queued experiments remain.')
        if not unobserved:
            collection.update_many({'_id': {'$in': [e['_id'] for e in exps_list]}}, {"$set": {"status": "PENDING"}})
        num_exceptions = 0
        tq = tqdm(enumerate(exps_list))
        for i_exp, exp in tq:
            if output_to_file:
                output_dir_path = get_output_dir_path(exp)
            else:
                output_dir_path = None
            success = start_local_job(collection, exp, unobserved, post_mortem, output_dir_path)
            if success is False:
                num_exceptions += 1
            tq.set_postfix(failed=f"{num_exceptions}/{i_exp} experiments")
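The log messages above pluralize their nouns with the s_if helper; a minimal sketch of such a helper, assuming it only appends an 's' for counts other than one (the actual seml utility may differ):

def s_if(n):
    # Append 's' unless exactly one item is described, e.g. "1 experiment" vs. "3 experiments".
    return '' if n == 1 else 's'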
    collection = get_collection(db_collection_name)
    if sacred_id is None:
        if len({'PENDING', 'RUNNING', 'KILLED'} & set(filter_states)) > 0:
            detect_killed(db_collection_name, print_detected=False)
        filter_dict = build_filter_dict(filter_states, batch_id, filter_dict)
        ndelete = collection.count_documents(filter_dict)
        batch_ids = collection.find(filter_dict, {'batch_id'})
        batch_ids_in_del = set([x['batch_id'] for x in batch_ids])
        if ndelete >= 10:
            if input(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection. "
                     f"Are you sure? (y/n) ").lower() != "y":
                exit()
        else:
            logging.info(f"Deleting {ndelete} configuration{s_if(ndelete)} from database collection.")
        collection.delete_many(filter_dict)
    else:
        exp = collection.find_one({'_id': sacred_id})
        if exp is None:
            logging.error(f"No experiment found with ID {sacred_id}.")
            sys.exit(1)
        else:
            logging.info(f"Deleting experiment with ID {sacred_id}.")
            batch_ids_in_del = set([exp['batch_id']])
            collection.delete_one({'_id': sacred_id})
    if len(batch_ids_in_del) > 0:
        # Clean up the uploaded sources if no experiments of a batch remain.
        delete_orphaned_sources(collection, batch_ids_in_del)
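build_filter_dict combines the requested states, an optional batch ID, and a user-supplied filter into one MongoDB query; a hypothetical sketch under that assumption (names and merge order are illustrative, not seml's exact implementation):

def build_filter_dict(filter_states, batch_id, filter_dict):
    # Merge the separate criteria into a single MongoDB filter document.
    merged = dict(filter_dict) if filter_dict else {}
    if filter_states:
        merged['status'] = {'$in': list(filter_states)}
    if batch_id is not None:
        merged['batch_id'] = batch_id
    return merged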
        if 'conda_environment' in exp['seml']:
            configs.append((exe, exp['seml']['conda_environment'], config))
        else:
            configs.append((exe, None, config))
    return configs
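Each returned tuple pairs the executable, an optional conda environment, and the experiment configuration. A hypothetical example of turning one tuple into a shell command, assuming config is a list of Sacred-style 'key=value' strings; seml's real launch logic is more involved:

exe, conda_env, config = configs[0]
cmd = f"python {exe} with {' '.join(config)}"  # Sacred-style config overrides
if conda_env is not None:
    # Run inside the requested conda environment if one was configured.
    cmd = f"conda run -n {conda_env} {cmd}"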
    elif slurm:
        if not output_to_file:
            logging.error("Output cannot be written to stdout in Slurm mode. "
                          "Remove the '--output-to-console' argument.")
            sys.exit(1)
        exp_chunks = chunk_list(exps_list)
        exp_arrays = batch_chunks(exp_chunks)
        njobs = len(exp_chunks)
        narrays = len(exp_arrays)
        logging.info(f"Starting {nexps} experiment{s_if(nexps)} in "
                     f"{njobs} Slurm job{s_if(njobs)} in {narrays} Slurm job array{s_if(narrays)}.")
        for exp_array in exp_arrays:
            job_name = get_exp_name(exp_array[0][0], collection.name)
            output_dir_path = get_output_dir_path(exp_array[0][0])
            slurm_config = exp_array[0][0]['slurm']
            del slurm_config['experiments_per_job']
            start_slurm_job(collection, exp_array, unobserved, post_mortem,
                            name=job_name, output_dir_path=output_dir_path, **slurm_config)
    else:
        login_node_name = 'fs'
        if login_node_name in os.uname()[1]:
            logging.error("Refusing to run a compute experiment on a login node. "
                          "Please use Slurm or a compute node.")
            sys.exit(1)
        [get_output_dir_path(exp) for exp in exps_list]  # Check if output dir exists
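chunk_list and batch_chunks decide how experiments map onto Slurm jobs and job arrays. A minimal sketch of the chunking step, assuming each experiment's 'slurm' section carries an experiments_per_job setting; this is illustrative, not seml's exact grouping logic:

def chunk_list(exps):
    # Split the experiments into chunks of at most experiments_per_job,
    # so that each chunk can run inside a single Slurm job.
    chunks, current = [], []
    for exp in exps:
        per_job = exp['slurm'].get('experiments_per_job', 1)
        current.append(exp)
        if len(current) >= per_job:
            chunks.append(current)
            current = []
    if current:
        chunks.append(current)
    return chunks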
def report_status(db_collection_name):
    detect_killed(db_collection_name, print_detected=False)
    collection = get_collection(db_collection_name)
    queued = collection.count_documents({'status': 'QUEUED'})
    pending = collection.count_documents({'status': 'PENDING'})
    failed = collection.count_documents({'status': 'FAILED'})
    killed = collection.count_documents({'status': 'KILLED'})
    interrupted = collection.count_documents({'status': 'INTERRUPTED'})
    running = collection.count_documents({'status': 'RUNNING'})
    completed = collection.count_documents({'status': 'COMPLETED'})
    title = f"********** Report for database collection '{db_collection_name}' **********"
    logging.info(title)
    logging.info(f"* - {queued:3d} queued experiment{s_if(queued)}")
    logging.info(f"* - {pending:3d} pending experiment{s_if(pending)}")
    logging.info(f"* - {running:3d} running experiment{s_if(running)}")
    logging.info(f"* - {completed:3d} completed experiment{s_if(completed)}")
    logging.info(f"* - {interrupted:3d} interrupted experiment{s_if(interrupted)}")
    logging.info(f"* - {failed:3d} failed experiment{s_if(failed)}")
    logging.info(f"* - {killed:3d} killed experiment{s_if(killed)}")
    logging.info("*" * len(title))
                elif 'output_file' in slurm_config:
                    # Backward compatibility, we used to store the path in 'slurm'.
                    output_file = slurm_config['output_file']
                else:
                    continue
                with open(output_file, 'r') as f:
                    all_lines = f.readlines()
                collection.update_one({'_id': exp['_id']}, {'$set': {'fail_trace': all_lines[-4:]}})
            except IOError:
                if 'output_file' in seml_config:
                    output_file = seml_config['output_file']
                elif 'output_file' in slurm_config:
                    # Backward compatibility
                    output_file = slurm_config['output_file']
                logging.warning(f"File {output_file} could not be read.")
    if print_detected:
        logging.info(f"Detected {nkilled} externally killed experiment{s_if(nkilled)}.")