for df in get_data(task):
idx = df.index
self.assertFalse(
idx.has_duplicates,
msg='%s task index data has duplicates!\n%s' % (
name, idx.values[idx.duplicated(keep=False)]))
if check_data_duplicates:
self.assertFalse(task.data.duplicated().any(),
msg='%s task data has duplicates!\n%s' % (
name, task.data[
task.data.duplicated(keep=False)]))
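        # Note: ``duplicated(keep=False)`` marks *every* row of a duplicated
        # group, not just the repeats, which is why the messages above can
        # list all offending rows, e.g.::
        #
        #     pd.Series([1, 1, 2]).duplicated(keep=False)  # [True, True, False]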
engine = task.engine
setup_from_db = False
if engine is not None:
sql_dtypes = task._split_kwargs(task.sql_dtypes)
for i, table in enumerate(safe_list(task.dbname)):
if table is not None:
setup_from_db = True
self.assertTrue(
engine.has_table(table),
msg='Database has no table %s of %s task' % (table,
name))
data = task._get_data(i)
data_cols = set(data.columns) | set(data.index.names)
self.assertEqual(set(sql_dtypes[i]) & data_cols,
data_cols,
msg='Missing sql dtype for %s' % name)
# check setup from file
if setup_from_file:
manager = self.organizer.param(
stations=self.stations_file, **kwargs)
def _get_setup(self):
if self._datafile and all(map(osp.exists, safe_list(self.datafile))):
return 'file'
if self.dbname:
engine = self.engine
if engine is not None and all(map(
engine.has_table, safe_list(self.dbname))):
return 'db'
return 'scratch'
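# ``safe_list`` is used throughout these snippets but not defined in them. A
# minimal sketch of its assumed behavior (hypothetical, not the package's
# actual implementation): pass lists and tuples through, wrap anything else
# (including None) as a single-item list.

def safe_list(obj):
    """Return `obj` as a list, wrapping scalars as a one-item list."""
    if isinstance(obj, (list, tuple, set)):
        return list(obj)
    return [obj]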
def write2file(self, **kwargs):
"""Write the database to the :attr:`datafile` file"""
for i, (datafile, kws) in enumerate(zip(safe_list(self.datafile),
self._split_kwargs(kwargs))):
data = self._get_data(i)
if data is None or not len(data):
continue
lock = _file_locks.get(datafile)
if lock:
self.logger.debug('Acquiring lock...')
lock.acquire()
exists = osp.exists(datafile)
self.logger.debug('Writing data to %sexisting file %s',
'not ' if not exists else '', datafile)
            try:
                safe_csv_append(data, datafile, **kws)
            finally:
                # always release the file lock, even if the write fails
                if lock:
                    lock.release()
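# ``safe_csv_append`` is referenced above without a definition. A plausible
# sketch, under the assumption that it appends to an existing CSV and only
# writes the header for a fresh file (hypothetical implementation):
import os.path as osp

def safe_csv_append(data, datafile, **kws):
    """Append `data` to `datafile`, writing the header only if it is new."""
    exists = osp.exists(datafile)
    data.to_csv(datafile, mode='a' if exists else 'w', header=not exists,
                **kws)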
else:
stations = [orig_stations]
        # The real number of station lists: we may have more processors
        # than stations, which then results in empty lists at the end of
        # `stations`
nstations_lists = next((i for i, l in enumerate(stations)
if len(l) == 0), len(stations))
# make sure we don't send a list of empty stations to a process
stations = stations[:nstations_lists]
nprocs = min(nprocs, nstations_lists)
if scheduler is None:
# create locks
for task in self.tasks:
for fname in safe_list(task.datafile):
_file_locks[fname] = mp.Lock()
for dbname in safe_list(task.dbname):
_db_locks[dbname] = mp.Lock()
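            # the module-level dicts ``_file_locks`` and ``_db_locks`` are
            # handed to every worker through the pool initializer below, so
            # all processes share one lock per file and per table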
# start the pool
logger.debug(
'Starting %s processes for %s station lists',
nprocs, len(stations))
pool = mp.Pool(nprocs, initializer=init_locks,
initargs=(_db_locks, _file_locks))
else:
file_locks = list(chain(*(
safe_list(task.datafile) for task in self.tasks)))
            db_locks = list(chain(*(
                safe_list(task.dbname) for task in self.tasks)))
        if i != len(grouped):
            unsafe = list(chain(*grouped[i+1::2]))
            _to_return = to_return + list(chain(*(
                t.setup_requires for t in chain(*unsafe[1::2]))))
        else:
            _to_return = to_return
        args = [[s, _to_return, True] for s in stations]
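# ``init_locks`` (the pool initializer above) is not shown in these snippets.
# A minimal sketch of the usual multiprocessing pattern, assuming it publishes
# the inherited locks as module-level globals inside each worker process:

def init_locks(db_locks, file_locks):
    global _db_locks, _file_locks
    _db_locks = db_locks
    _file_locks = file_locks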
def setup_from_file(self, **kwargs):
"""Set up the task from already stored files"""
kwargs = self._split_kwargs(kwargs)
chunksize = self.global_config.get('chunksize', 10 ** 5)
for i, datafile in enumerate(safe_list(self.datafile)):
if not self.task_config.skip_filtering:
data = []
for all_data in pd.read_csv(datafile, chunksize=chunksize,
**kwargs[i]):
if 'id' in all_data.columns:
all_data.set_index('id', inplace=True)
stations = list(self.stations)
if len(all_data.index.names) == 1:
data.append(all_data.loc(axis=0)[stations])
else:
names = all_data.index.names
axis = names.index('id')
                        key = [slice(None) for _ in range(axis)] + [
                            stations] + [
                            slice(None) for _ in range(
                                axis, len(names) - 1)]
                        # assumed completion: use ``key`` to select the
                        # stations along the ``id`` level
                        data.append(all_data.loc[tuple(key), :])
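        # The ``key`` built above is the standard pandas pattern for label
        # selection on one level of a (sorted) MultiIndex: one indexer per
        # level, with ``slice(None)`` meaning "keep everything", e.g.::
        #
        #     df.loc[(slice(None), ['a', 'b']), :]  # filter the second level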
# start the computation
if scheduler is not None:
try:
kws = {'workers': set(client.cluster.workers[:nprocs])}
except AttributeError:
kws = {}
for proc_args in args:
        # assumed condition: reuse an existing project file when available
        if inproject is not None and osp.exists(inproject):
            import psyplot.project as psy
            self.logger.debug(' Loading existing project %s', inproject)
            sp = psy.Project.load_project(inproject, datasets=ds_list)
        else:
            self.logger.debug(' Creating project...')
            sp = self.create_project(ds_orig)
# ---- save data and project
pdf = sp.export(plot_output, tight=True, close_pdf=False)
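        # ``close_pdf=False`` keeps the returned PdfPages object open so that
        # ``plot_additionals`` below can add extra pages; it is closed at the
        # end of the method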
if project_output:
self.logger.debug(' Saving project to %s', project_output)
if nc_output:
for f in safe_list(nc_output):
if osp.exists(f):
os.remove(f)
save_kws = dict(use_rel_paths=True, paths=safe_list(nc_output))
else: # save the entire dataset into the pickle file
save_kws = dict(ds_description={'ds'})
sp.save_project(project_output, **save_kws)
# ---- make plots not covered by psyplot
self.plot_additionals(pdf)
# ---- configure the experiment
self.make_run_config(sp, info, *args, **kwargs)
# ---- export the figures
self.logger.debug(' Saving plots to %s', plot_output)
pdf.close()
        # ---- close the project (figures, data and datasets)
        if kwargs.get('close', True) or self.task_config.close:
            sp.close(True, True, True)
def setup_from_db(self, **kwargs):
"""Set up the task from datatables already created"""
kwargs = self._split_kwargs(kwargs)
for i, dbname in enumerate(safe_list(self.dbname)):
if self.task_config.skip_filtering:
self._set_data(
pd.read_sql_query("SELECT * FROM %s" % (dbname, ),
self.engine, **kwargs[i]),
i)
else:
self._set_data(pd.read_sql_query(
"SELECT * FROM %s WHERE id IN (%s)" % (
dbname, ', '.join(map("'{0}'".format, self.stations))),
self.engine, **kwargs[i]), i)
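# The interpolated query above works because the station ids come from the
# task itself. For untrusted input, the same read can be expressed with a
# parameterized query -- a sketch assuming a pyformat DB-API driver such as
# psycopg2, which adapts Python tuples for ``IN`` (the table name itself
# cannot be parameterized):
import pandas as pd

def read_stations(engine, dbname, stations):
    query = "SELECT * FROM %s WHERE id IN %%(ids)s" % dbname
    return pd.read_sql_query(query, engine, params={'ids': tuple(stations)})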