Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def output(self):
    """Target pointing at the pickled data frame produced by run()."""
    destination = self.output_file_path
    return self.make_target(destination)
def run(self):
    """Load the upstream data frame and cast each configured column to pandas 'category' dtype."""
    columns = list(self.categorical_column_names)
    frame = self.load_data_frame(required_columns=set(columns))
    for column in columns:
        frame[column] = frame[column].astype('category')
    self.dump(frame)
class SplitTrainTestData(gokart.TaskOnKart):
    """Randomly split the data frame produced by ``data_task`` into train/test subsets.

    Outputs a dict of two targets, keyed 'train' and 'test'.
    """
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter()
    # Fraction of rows assigned to the test split (e.g. 0.2).
    test_size_rate = luigi.FloatParameter()
    train_output_file_path = luigi.Parameter(default='data/train_data.pkl')  # type: str
    test_output_file_path = luigi.Parameter(default='data/test_data.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return dict(
            train=self.make_target(self.train_output_file_path), test=self.make_target(self.test_output_file_path))

    def run(self):
        data = self.load_data_frame()
        # train_test_split shuffles by default, so the previous explicit
        # sklearn.utils.shuffle(data) pass was redundant work and was removed.
        train, test = sklearn.model_selection.train_test_split(data, test_size=self.test_size_rate)
        self.dump(train, 'train')
        self.dump(test, 'test')
subprocess.call(['bkill', job_id])
class LSFJobTask(luigi.Task):
    """
    Takes care of uploading and executing an LSF job
    """
    # bsub -n: number of CPU slots to request.
    n_cpu_flag = luigi.IntParameter(default=2, significant=False)
    # Scratch directory; presumably must be visible to both submit and execution hosts — confirm.
    shared_tmp_dir = luigi.Parameter(default='/tmp', significant=False)
    # bsub -R style resource requirement string.
    resource_flag = luigi.Parameter(default='mem=8192', significant=False)
    # Memory limit value; presumably in MB (bsub -M) — confirm.
    memory_flag = luigi.Parameter(default='8192', significant=False)
    # bsub -q: queue to submit to; the default is a placeholder name.
    queue_flag = luigi.Parameter(default='queue_name', significant=False)
    # NOTE(review): presumably a wall-clock limit in minutes (bsub -W) — confirm.
    runtime_flag = luigi.IntParameter(default=60)
    # Optional job name (bsub -J); empty string means no explicit name.
    job_name_flag = luigi.Parameter(default='')
    poll_time = luigi.FloatParameter(
        significant=False, default=5,
        description="specify the wait time to poll bjobs for the job status")
    # If True, keep job metadata after completion (exact effect not visible in this chunk).
    save_job_info = luigi.BoolParameter(default=False)
    # NOTE(review): a Parameter named `output` shadows luigi.Task.output(); anything
    # relying on the Task.output method will see this Parameter instead — confirm intended.
    output = luigi.Parameter(default='')
    # Extra arguments appended verbatim to the bsub command line.
    extra_bsub_args = luigi.Parameter(default='')
    # Last observed status of the submitted job; None until set elsewhere in the class.
    job_status = None
def fetch_task_failures(self):
"""
Read in the error file from bsub
"""
error_file = os.path.join(self.tmp_dir, "job.err")
if os.path.isfile(error_file):
with open(error_file, "r") as f_err:
errors = f_err.readlines()
distances = pickle.load(f)
res_dict.update(distances)
with open(self.output_path, 'wb') as f:
pickle.dump(res_dict, f)
def output(self):
    """Local file target where the merged result is written."""
    target_path = self.output_path
    return luigi.LocalTarget(target_path)
class PairwiseDistanceWorkflow(WorkflowBase):
    """Compute pairwise object distances and merge them into a single output file."""
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    morphology_path = luigi.Parameter()
    morphology_key = luigi.Parameter()
    output_path = luigi.Parameter()
    max_distance = luigi.FloatParameter()
    resolution = luigi.ListParameter()
    max_size = luigi.IntParameter(default=None)

    def requires(self):
        # Resolve the backend-specific implementation of the ObjectDistances task.
        distance_task_cls = getattr(distance_tasks,
                                    self._get_task_name('ObjectDistances'))
        distance_dep = distance_task_cls(tmp_folder=self.tmp_folder,
                                         max_jobs=self.max_jobs,
                                         config_dir=self.config_dir,
                                         input_path=self.input_path,
                                         input_key=self.input_key,
                                         morphology_path=self.morphology_path,
                                         morphology_key=self.morphology_key,
                                         max_distance=self.max_distance,
                                         resolution=self.resolution,
                                         max_size=self.max_size)
        # Chain the merge step onto the distance computation.
        return MergePairwiseDistances(tmp_folder=self.tmp_folder,
                                      max_jobs=self.max_jobs,
                                      output_path=self.output_path,
                                      dependency=distance_dep)
from collections import defaultdict
from logging import getLogger
import luigi
import numpy as np
import gokart
logger = getLogger(__name__)
class FilterItemByWordSimilarity(gokart.TaskOnKart):
    # NOTE(review): summary inferred from the class name and its three inputs — confirm.
    """Filter items using similarity between word embeddings and item-title embeddings."""
    word2items_task = gokart.TaskInstanceParameter()
    word2embedding_task = gokart.TaskInstanceParameter()
    item2title_embedding_task = gokart.TaskInstanceParameter()
    # Float threshold; its exact use is not visible in this chunk — confirm against run().
    no_below = luigi.FloatParameter()
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/filter_item_by_word_similarity.pkl')  # type: str
def requires(self):
    """Declare the three upstream embedding/item tasks this task loads from."""
    return {
        'word2items': self.word2items_task,
        'word2embedding': self.word2embedding_task,
        'item2title_embedding': self.item2title_embedding_task,
    }
def output(self):
    """Target for the filtered-result pickle."""
    destination = self.output_file_path
    return self.make_target(destination)
def run(self):
word2items = self.load('word2items')
word2embedding = self.load('word2embedding')
item2title_embedding = self.load('item2title_embedding')
# Skip dependency evaluation when set (exact effect not visible in this chunk).
no_evaluate_dependency = luigi.BoolParameter(default=False)
# AugustusTM(R) parameters
augustus = luigi.BoolParameter(default=False)
augustus_species = luigi.Parameter(default='human', significant=False)
# NOTE(review): default=None on luigi.Parameter is deprecated in newer luigi;
# consider luigi.OptionalParameter — confirm against the pinned luigi version.
augustus_hints_db = luigi.Parameter(default=None)
# Extrinsic-hint config files shipped with the pipeline.
tm_cfg = luigi.Parameter(default='augustus_cfgs/extrinsic.ETM1.cfg', significant=False)
tmr_cfg = luigi.Parameter(default='augustus_cfgs/extrinsic.ETM2.cfg', significant=False)
# AugustusCGP parameters
augustus_cgp = luigi.BoolParameter(default=False)
cgp_param = luigi.Parameter(default='augustus_cfgs/log_reg_parameters_default.cfg', significant=False)
augustus_cgp_cfg_template = luigi.Parameter(default='augustus_cfgs/cgp_extrinsic_template.cfg', significant=False)
# MAF alignment chunking: chunk size and overlap, presumably in bases — confirm.
maf_chunksize = luigi.IntParameter(default=2500000, significant=False)
maf_overlap = luigi.IntParameter(default=500000, significant=False)
# consensus options
resolve_split_genes = luigi.BoolParameter(default=False)
cgp_splice_support = luigi.FloatParameter(default=0.8, significant=False)
cgp_num_exons = luigi.IntParameter(default=3, significant=False)
# Toil options (camelCase names mirror Toil's own option names)
batchSystem = luigi.Parameter(default='singleMachine', significant=False)
maxCores = luigi.IntParameter(default=32, significant=False)
logLevel = luigi.Parameter(default='WARNING', significant=False)  # this is passed to toil
cleanWorkDir = luigi.Parameter(default='onSuccess', significant=False)  # debugging option
parasolCommand = luigi.Parameter(default=None, significant=False)
# 8 GiB expressed in bytes.
defaultMemory = luigi.IntParameter(default=8 * 1024 ** 3, significant=False)
workDir = luigi.Parameter(default=None, significant=False)
disableCaching = luigi.BoolParameter(default=False, significant=False)
def __repr__(self):
"""override the repr to make logging cleaner"""
# we are in a genome-specific task, so say so
if hasattr(self, 'genome'):
return 'Task: {} for {}'.format(self.__class__.__name__, self.genome)
os.getenv("GPDB_PORT")
)
register_adapter(QuotedIdentifier, lambda x: x)
class DatabaseConfig(luigi.Config):
    """Names of the database tables used throughout the workflow (from luigi config)."""
    base_table = luigi.Parameter()
    feature_input_table = luigi.Parameter()
    pca_input_base_table = luigi.Parameter()
    pca_output_base_table = luigi.Parameter()
    outlier_base_table = luigi.Parameter()
class ModelConfig(luigi.Config):
    """Model settings supplied via luigi configuration."""
    user_col = luigi.Parameter()
    percentage_val = luigi.FloatParameter()
    threshold = luigi.FloatParameter()
# Default output location: target/<user_col>_<today's date> next to this module.
# NOTE(review): evaluated at import time, so the date (and ModelConfig lookup) is
# fixed when the module loads, not when a task runs — confirm this is intended.
TARGET_PATH=os.path.join(os.path.dirname(__file__),'target/{feature}_{date}'.format(
    feature=ModelConfig().user_col,
    date=date.today())
)
# Configuration classes
class PathConfig(luigi.Config):
    """Filesystem paths used by the pipeline."""
    target_path = luigi.Parameter(default=TARGET_PATH)
def table_names_dict(id):
pca_tables = {
'hour':id,
'pca_input':DatabaseConfig().pca_input_base_table+'_{}'.format(id),
'pca_output':DatabaseConfig().pca_output_base_table+'_{}'.format(id),
import gokart
from redshells.model import FeatureAggregationSimilarityModel
from redshells.model.feature_aggregation_similarity_model import FeatureAggregationSimilarityDataset
logger = getLogger(__name__)
class TrainFeatureAggregationSimilarityModel(gokart.TaskOnKart):
    """Train a FeatureAggregationSimilarityModel on the dataset from `dataset_task`."""
    dataset_task = gokart.TaskInstanceParameter(description='An instance of task which outputs `FeatureAggregationSimilarityDataset`.')
    embedding_size = luigi.IntParameter()  # type: int
    learning_rate = luigi.FloatParameter()  # type: float
    batch_size = luigi.IntParameter()  # type: int
    epoch_size = luigi.IntParameter()  # type: int
    test_size_rate = luigi.FloatParameter()  # type: float
    early_stopping_patience = luigi.IntParameter()  # type: int
    max_data_size = luigi.IntParameter()  # type: int
    # Fixed a typo in the default path: a stray ')' replaced the intended '_'
    # (was 'model/feature_aggregation)similarity_model.pkl'). Note this changes the
    # default cache location for anyone who relied on the misspelled path.
    output_file_path = luigi.Parameter(default='model/feature_aggregation_similarity_model.pkl')  # type: str
def requires(self):
    """The upstream task that produces the training dataset."""
    upstream = self.dataset_task
    return upstream
def output(self):
    """Target where the trained model pickle is written."""
    destination = self.output_file_path
    return self.make_target(destination)
def run(self):
dataset = self.load() # type: FeatureAggregationSimilarityDataset
feature_size = dataset.x_item_features.shape[1]
item_size = max(np.max(dataset.x_item_indices), np.max(dataset.y_item_indices))
max_feature_index = max(np.max(dataset.x_item_features), np.max(dataset.y_item_features))
# Proxy class that implements the actual HTCondor submission/polling for this workflow.
workflow_proxy_cls = HTCondorWorkflowProxy
pool = luigi.Parameter(default=NO_STR, significant=False, description="target htcondor pool")
scheduler = luigi.Parameter(default=NO_STR, significant=False, description="target htcondor "
    "scheduler")
retries = luigi.IntParameter(default=5, significant=False, description="number of automatic "
    "resubmission attempts per job, default: 5")
tasks_per_job = luigi.IntParameter(default=1, significant=False, description="number of tasks "
    "to be processed by one job, default: 1")
only_missing = luigi.BoolParameter(significant=False, description="skip tasks that are "
    "considered complete")
no_poll = luigi.BoolParameter(significant=False, description="just submit, do not initiate "
    "status polling after submission")
threads = luigi.IntParameter(default=4, significant=False, description="number of threads to "
    "use for (re)submission and status queries, default: 4")
interval = luigi.FloatParameter(default=3, significant=False, description="time between status "
    "polls in minutes, default: 3")
walltime = luigi.FloatParameter(default=48, significant=False, description="maximum wall time "
    "in hours, default: 48")
max_poll_fails = luigi.IntParameter(default=5, significant=False, description="maximum number "
    "of consecutive errors during polling, default: 5")
cancel_jobs = luigi.BoolParameter(default=False, description="cancel all submitted jobs, no "
    "new submission")
transfer_logs = luigi.BoolParameter(significant=False, description="transfer job logs to the "
    "output directory")
# Parameters excluded from branch (worker) tasks — they only matter for the
# submitting task, so they must not affect branch task identity.
exclude_params_branch = {"pool", "scheduler", "retries", "tasks_per_job", "only_missing",
    "no_poll", "threads", "interval", "walltime", "max_poll_fails", "cancel_jobs",
    "transfer_logs"}
@abstractmethod
def htcondor_output_directory(self):
# Interpolation grid vertices (rows, cols); presumably for atmospheric-coefficient
# interpolation — confirm against the module using it.
vertices = luigi.TupleParameter(default=(5, 5))
# Interpolation method, chosen from the Method enum.
method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
pixel_quality = luigi.BoolParameter()
land_sea_path = luigi.Parameter()
# Ancillary inputs: a dict default of the form {'user': value} supplies a fixed
# user-provided value instead of a file/database lookup — confirm.
aerosol = luigi.DictParameter(default={'user': 0.05}, significant=False)
brdf_path = luigi.Parameter(significant=False)
brdf_premodis_path = luigi.Parameter(significant=False)
ozone_path = luigi.Parameter(significant=False)
water_vapour = luigi.DictParameter(default={'user': 1.5},
                                   significant=False)
ecmwf_path = luigi.Parameter(significant=False)
invariant_height_fname = luigi.Parameter(significant=False)
dsm_fname = luigi.Parameter(significant=False)
# Path to the MODTRAN executable used for radiative transfer.
modtran_exe = luigi.Parameter(significant=False)
tle_path = luigi.Parameter(significant=False)
rori = luigi.FloatParameter(default=0.52, significant=False)
# HDF5 compression settings for the output file.
compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                  default=H5CompressionFilter.LZF,
                                  significant=False)
filter_opts = luigi.DictParameter(default=None, significant=False)
acq_parser_hint = luigi.OptionalParameter(default='')
# Buffer distance; presumably in metres — confirm.
buffer_distance = luigi.FloatParameter(default=8000, significant=False)
h5_driver = luigi.OptionalParameter(default='', significant=False)
def output(self):
    """HDF5 target named after the granule (or the level-1 basename) under outdir."""
    # Fall back to the level-1 basename when no granule id is set; `or` matches
    # the original truthiness-based conditional exactly.
    label = self.granule or basename(self.level1)
    out_fname = '{label}.wagl.h5'.format(label=label)
    return luigi.LocalTarget(pjoin(self.outdir, out_fname))
def run(self):