"""
# path to the n5 file and keys
path = luigi.Parameter()
aff_key = luigi.Parameter()
ws_key = luigi.Parameter()
out_key = luigi.Parameter()
# dummy parameter to be consistent with other segmentation tasks
max_jobs = luigi.IntParameter()
# path to the configuration
config_path = luigi.Parameter()
tmp_folder = luigi.Parameter()
dependency = luigi.TaskParameter()
# FIXME default does not work; this still needs to be specified
time_estimate = luigi.IntParameter(default=10)
run_local = luigi.BoolParameter(default=False)
def requires(self):
return self.dependency
def _collect_outputs(self):
res_path = os.path.join(self.tmp_folder, 'multicut_time.json')
try:
assert os.path.exists(res_path)
with open(res_path) as f:
t = json.load(f)['t']
os.remove(res_path)
except Exception:
return None
return t
def run(self):
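# A minimal sketch (not from the original source) of the producer side of the
# handshake above: the cluster job is assumed to time its work and dump the
# runtime to 'multicut_time.json' in the shared tmp folder, where
# _collect_outputs() later picks it up.
import json
import os
import time


def run_multicut_job(tmp_folder):
    t0 = time.time()
    # ... the actual multicut computation would happen here ...
    with open(os.path.join(tmp_folder, 'multicut_time.json'), 'w') as f:
        json.dump({'t': time.time() - t0}, f)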
# --- Excerpt: luigi's top_artists example (tail of the Top10Artists input
# iterator; its enclosing loop is truncated here) ---
                    yield int(streams), artist
class ArtistToplistToDatabase(luigi.contrib.postgres.CopyToTable):
    """
    This task runs a :py:class:`luigi.contrib.postgres.CopyToTable` task
    over the target data returned by :py:meth:`~.Top10Artists.output` and
    writes the result into its :py:meth:`~.ArtistToplistToDatabase.output` target which,
    by default, is :py:class:`luigi.contrib.postgres.PostgresTarget` (a table in PostgreSQL).

    This class uses :py:meth:`luigi.contrib.postgres.CopyToTable.run`
    and :py:meth:`luigi.contrib.postgres.CopyToTable.output`.
    """
    date_interval = luigi.DateIntervalParameter()
    use_spark = luigi.BoolParameter()

    host = "localhost"
    database = "toplists"
    user = "luigi"
    password = "abc123"  # ;)
    table = "top10"

    columns = [("date_from", "DATE"),
               ("date_to", "DATE"),
               ("artist", "TEXT"),
               ("streams", "INT")]

    def requires(self):
        """
        This task's dependencies:

        * :py:class:`~.Top10Artists`
        """
        # body assumed from the docstring's reference to Top10Artists;
        # the original excerpt is truncated here
        return Top10Artists(self.date_interval, self.use_spark)
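# A minimal usage sketch (assumed setup, not part of the original example):
# running the pipeline with the local scheduler. CopyToTable creates the
# "top10" table from `columns` on first run and inserts the rows produced
# upstream.
import luigi
from luigi import date_interval

luigi.build(
    [ArtistToplistToDatabase(date_interval=date_interval.Week(2014, 1),
                             use_spark=False)],
    local_scheduler=True,
)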
"OWNER": f"arn:aws:iam::{self.account_id}:role/servicecatalog-puppet/PuppetRole"
},
)
self.write_output(changes_made)
class RunDeployInSpokeTask(tasks.PuppetTask):
manifest_file_path = luigi.Parameter()
puppet_account_id = luigi.Parameter()
account_id = luigi.Parameter()
home_region = luigi.Parameter()
regions = luigi.ListParameter()
should_collect_cloudformation_events = luigi.BoolParameter()
should_forward_events_to_eventbridge = luigi.BoolParameter()
should_forward_failures_to_opscenter = luigi.BoolParameter()
def params_for_results_display(self):
return {
"manifest_file_path": self.manifest_file_path,
"puppet_account_id": self.puppet_account_id,
"account_id": self.account_id,
}
def run(self):
with betterboto_client.CrossAccountClientContextManager(
"s3",
f"arn:aws:iam::{self.puppet_account_id}:role/servicecatalog-puppet/PuppetRole",
f"s3-{self.puppet_account_id}",
) as s3:
bucket = f"sc-puppet-spoke-deploy-{self.puppet_account_id}"
key = f"{os.getenv('CODEBUILD_BUILD_NUMBER', '0')}.yaml"
# --- Excerpt: initial sub-graph computation task ---
class InitialSubgraphTask(luigi.Task):
    """
    Compute initial sub-graphs
    """
    path = luigi.Parameter()
    ws_key = luigi.Parameter()
    out_path = luigi.Parameter()
    max_jobs = luigi.Parameter()
    config_path = luigi.Parameter()
    tmp_folder = luigi.Parameter()
    dependency = luigi.TaskParameter()
    # FIXME default does not work; this still needs to be specified
    time_estimate = luigi.IntParameter(default=10)
    run_local = luigi.BoolParameter(default=False)

    def requires(self):
        return self.dependency

    def _prepare_jobs(self, n_jobs, block_list, block_shape):
        # distribute the blocks round-robin over the jobs and write one
        # config file per job
        for job_id in range(n_jobs):
            block_jobs = block_list[job_id::n_jobs]
            job_config = {'block_shape': block_shape,
                          'block_list': block_jobs}
            config_path = os.path.join(self.tmp_folder,
                                       'initial_subgraph_config_job%i.json' % job_id)
            with open(config_path, 'w') as f:
                json.dump(job_config, f)
    def _submit_job(self, job_id):
        script_path = os.path.join(self.tmp_folder, 'initial_subgraph.py')
        config_path = os.path.join(self.tmp_folder,
                                   'initial_subgraph_config_job%i.json' % job_id)
        # (rest of _submit_job truncated in this excerpt)
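# A minimal sketch (file layout taken from _prepare_jobs above) of the
# consumer side: each worker job reads its own config file and processes the
# blocks assigned to it.
import json
import os


def load_job_blocks(tmp_folder, job_id):
    config_path = os.path.join(tmp_folder,
                               'initial_subgraph_config_job%i.json' % job_id)
    with open(config_path) as f:
        job_config = json.load(f)
    return job_config['block_shape'], job_config['block_list']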
# --- Excerpt: corpus-building pipeline (the start of this output() method,
# likely BuildCorpus.output(), is truncated) ---
        corpus_name = corpus_suffix.replace('.zip', '.sqlite')
        return luigi.LocalTarget(path.join(self.data_dir, corpus_name))

    def run(self):
        # build into a temporary file and atomically rename it on success
        try:
            corpus.build_corpus(self.output().path + '.tmp', self.input()['corpus'].path)
            os.rename(self.output().path + '.tmp', self.output().path)
        except BaseException:
            # remove the partial temporary file before re-raising
            os.system("rm -rf '%s'" % (self.output().path + '.tmp'))
            raise
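    # A note on the pattern above: luigi ships a built-in for this
    # tmp-file-then-rename idiom. LocalTarget.temporary_path() yields a
    # temporary path and atomically moves it into place when the block exits
    # cleanly, so an equivalent run() (a sketch, same task class) shrinks to:
    def run(self):
        with self.output().temporary_path() as tmp_path:
            corpus.build_corpus(tmp_path, self.input()['corpus'].path)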
class CreateFeaturizer(SharedParameters):
    training_fraction = luigi.FloatParameter(default=0.8)
    use_bigrams = luigi.BoolParameter(default=False)
    use_unigrams = luigi.BoolParameter(default=True)
    max_features = luigi.IntParameter(default=100000000)
    name = luigi.Parameter(default='default')

    def requires(self):
        return {'corpus': BuildCorpus()}

    def output(self):
        return luigi.LocalTarget(
            path.join(self.model_dir, 'featurizer-%s.pickle' % self.name)
        )

    def run(self):
        logger.info(
            "Loading corpus from file %s" % self.input()['corpus'].path
        )
        c = corpus.Corpus.load(self.input()['corpus'].path, self.training_fraction)
        # (rest of run() truncated in this excerpt)
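# A minimal usage sketch (assumed entry point, not part of the original
# source): building the featurizer and everything it depends on with the
# local scheduler.
import luigi

luigi.build([CreateFeaturizer(name='default')], local_scheduler=True)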
# --- Excerpt: NIH/MeSH joining pipeline ---
key_prefix = 'nih_abstracts_processed/mti'


class MeshJoinTask(luigi.Task):
    '''Joins MeSH labels stored in S3 to NIH projects in MySQL.

    Args:
        date (str): Date used to label the outputs
        _routine_id (str): String used to label the AWS task
        db_config_env (str): Environment variable for path to MySQL database
            configuration.
    '''
    date = luigi.DateParameter()
    _routine_id = luigi.Parameter()
    db_config_env = luigi.Parameter()
    test = luigi.BoolParameter()

    @staticmethod
    def format_mesh_terms(df):
        """
        Removes unneeded columns and pivots the MeSH terms data into a dictionary.

        Args:
            df (dataframe): MeSH terms as returned from retrieve_mesh_terms

        Returns:
            (dict): document_id: list of MeSH terms
        """
        logging.info("Formatting mesh terms")
        # remove PRC rows
        df = df.drop(df[df.term == 'PRC'].index, axis=0)
        # (rest of format_mesh_terms truncated in this excerpt)
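# A hedged sketch (column names assumed) of the pivot step the docstring
# describes: group the remaining rows by document id and collect each
# document's MeSH terms into a list.
def pivot_mesh_terms(df):
    return df.groupby('doc_id')['term'].apply(list).to_dict()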
# --- Excerpt: skeleton evaluation task ---
import os

import luigi
import z5py
from cremi_tools.skeletons import build_skeleton_metrics

from production.util import DummyTask


class SkeletonEvaluationTask(luigi.Task):
    path = luigi.Parameter()
    seg_key = luigi.Parameter()
    skeleton_keys = luigi.ListParameter()
    n_threads = luigi.IntParameter()
    tmp_folder = luigi.Parameter()
    dependency = luigi.TaskParameter(default=DummyTask())
    time_estimate = luigi.IntParameter(default=10)
    run_local = luigi.BoolParameter(default=False)

    def requires(self):
        return self.dependency

    # TODO enable ROIs
    def run(self):
        from .. import util
        # copy the script to the temp folder and replace the shebang
        file_dir = os.path.dirname(os.path.abspath(__file__))
        script_path = os.path.join(self.tmp_folder, 'skeleton_evaluation.py')
        util.copy_and_replace(os.path.join(file_dir, 'skeleton_evaluation.py'),
                              script_path)
        # check that inputs exist
        # (rest of run() truncated in this excerpt)
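# A minimal sketch (assumed behavior; the real helper lives in
# production.util and is not shown here) of what a copy_and_replace that
# swaps the shebang line could look like:
def copy_and_replace(src, dst, shebang='#! /usr/bin/env python'):
    with open(src) as f:
        lines = f.read().split('\n')
    lines[0] = shebang  # replace the original first line with the new shebang
    with open(dst, 'w') as f:
        f.write('\n'.join(lines))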
# --- Excerpt: meetup topic-discovery task (the start of the docstring is
# truncated) ---
class TopicDiscoveryTask(luigi.Task):  # class name assumed; excerpt begins mid-docstring
    '''...defined as the most frequently occurring from a set of categories.

    Args:
        db_config_env (str): Environmental variable pointing to the path of the DB config.
        routine_id (str): The routine UID.
        core_categories (list): A list of category_shortnames from which to identify topics.
        members_perc (int): A percentile to evaluate the minimum number of members.
        topic_perc (int): A percentile to evaluate the most frequent topics.
        test (bool): Test mode.
    '''
    db_config_env = luigi.Parameter()
    routine_id = luigi.Parameter()
    core_categories = luigi.ListParameter()
    members_perc = luigi.IntParameter(default=10)
    topic_perc = luigi.IntParameter(default=10)
    test = luigi.BoolParameter(default=True)

    def output(self):
        '''Points to the S3 Target'''
        return s3.S3Target(f"{S3PREFIX}/meetup-topics-{self.routine_id}.json")

    def run(self):
        '''Extract the topics of interest'''
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        # find the member-count threshold, then the most frequent topics
        members_limit = get_members_by_percentile(engine, perc=self.members_perc)
        topics = get_core_topics(engine,
                                 core_categories=self.core_categories,
                                 members_limit=members_limit,
                                 perc=self.topic_perc)
        # Write the intermediate output
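        # A plausible completion (not in the original excerpt; assumes json is
        # imported): serialise the topics straight to the S3 target, which
        # luigi's S3Target can open for writing.
        with self.output().open('w') as f:
            json.dump({'topics': topics}, f)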
# --- Excerpt: blockwise thresholding workflow ---
class ThresholdWorkflow(luigi.Task):  # class name assumed; excerpt begins mid-class
    # path to the n5 file and keys
    path = luigi.Parameter()
    aff_key = luigi.Parameter()
    mask_key = luigi.Parameter()
    out_key = luigi.Parameter()
    # maximal number of jobs that will be run in parallel
    max_jobs = luigi.IntParameter()
    # path to the configuration
    # TODO allow individual paths for individual blocks
    config_path = luigi.Parameter()
    tmp_folder = luigi.Parameter()
    # FIXME default does not work; this still needs to be specified
    # TODO different time estimates for different sub-tasks
    time_estimate = luigi.IntParameter(default=10)
    run_local = luigi.BoolParameter(default=False)

    def requires(self):
        # chain the sub-tasks: threshold -> offsets -> merge -> node assignment
        thresh_task = ThresholdTask(path=self.path, aff_key=self.aff_key,
                                    mask_key=self.mask_key, out_key=self.out_key,
                                    max_jobs=self.max_jobs, config_path=self.config_path,
                                    tmp_folder=self.tmp_folder, time_estimate=self.time_estimate,
                                    run_local=self.run_local)
        offset_task = OffsetTask(tmp_folder=self.tmp_folder, dependency=thresh_task,
                                 time_estimate=self.time_estimate, run_local=self.run_local)
        merge_task = MergeTask(path=self.path, out_key=self.out_key, config_path=self.config_path,
                               max_jobs=self.max_jobs, tmp_folder=self.tmp_folder,
                               dependency=offset_task,
                               time_estimate=self.time_estimate, run_local=self.run_local)
        assignment_task = NodeAssignmentTask(path=self.path, out_key=self.out_key, config_path=self.config_path,
                                             max_jobs=self.max_jobs, tmp_folder=self.tmp_folder,
                                             # arguments and return below assumed by analogy
                                             # with MergeTask above; the original is truncated
                                             dependency=merge_task,
                                             time_estimate=self.time_estimate,
                                             run_local=self.run_local)
        return assignment_task
#
# Block-wise gradient computation tasks
#

class GradientsBase(luigi.Task):
    """ Gradients base class
    """
    task_name = 'gradients'
    src_file = os.path.abspath(__file__)
    allow_retry = True

    path_dict = luigi.Parameter()
    output_path = luigi.Parameter()
    output_key = luigi.Parameter()
    average_gradient = luigi.BoolParameter(default=True)
    dependency = luigi.TaskParameter(default=DummyTask())

    def requires(self):
        return self.dependency

    def _validate_paths(self):
        # make sure every input file exists and contains the expected dataset
        shape = None
        with open(self.path_dict) as f:
            path_dict = json.load(f)
        for path in sorted(path_dict):
            key = path_dict[path]
            assert os.path.exists(path)
            with vu.file_reader(path, 'r') as f:
                assert key in f
                # (rest of _validate_paths truncated in this excerpt)
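# A hedged sketch (paths and dataset keys made up) of producing the path_dict
# JSON that _validate_paths consumes: a mapping from container path to the
# dataset key inside that container.
import json

path_dict = {
    "/data/sampleA.n5": "predictions/affinities",
    "/data/sampleB.n5": "predictions/affinities",
}
with open("path_dict.json", "w") as f:
    json.dump(path_dict, f)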