Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __init__(self, profile=None, cluster_id=None):
    """Create the `ipp.Client` connection and initialise bookkeeping state.

    `profile` and `cluster_id` are forwarded unchanged to `ipp.Client`.
    """
    connection = dict(profile=profile, cluster_id=cluster_id)
    self.client = ipp.Client(**connection)
    # Bookkeeping defaults: empty status table, the key field name, and the
    # module-level sleep interval.
    self.statusDict, self.keyField = {}, 'key'
    self.sleepSeconds = SLEEP_SECONDS
def test_meta_param(ma2):
    """Generating with `_uses_meta` set in the state must raise.

    The test passes when the attempt raises `TypeError` or
    `ipyparallel.error.RemoteError`, and fails if nothing is raised.
    """
    model = ma2.get_reference('MA2')
    try:
        # Add the flag to the state, then trigger generation.
        model['_uses_meta'] = True
        model.generate()
    except TypeError:
        return
    except ipyparallel.error.RemoteError:
        return
    assert False, "Should raise an error"
# Notebook export: each call below replays one notebook cell through the
# IPython shell. The sequence starts a 2-engine MPI cluster with the
# `ipcluster` line magic, connects a client, and then submits `%%px` cells
# whose source is carried in the string argument.
get_ipython().run_line_magic('ipcluster', '--version')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
# Repeat a few times in case of `TimeoutError`.
# After the cluster starts, the following calls do nothing
# but print "IPCluster is already running".
# This mimics what the user would do in such a case.
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
get_ipython().run_line_magic('ipcluster', 'start -n 2 --mpi')
# Connect to the running cluster and show which engine ids are available.
c = ipp.Client()
print('cluster ids:', c.ids)
# Diagnostics: list the current user's processes matching "ip", then report the hostname.
get_ipython().run_cell_magic('px', '', 'import os\nprint(os.popen("ps -u $USER | grep ip").read())')
get_ipython().run_cell_magic('px', '', 'import socket\nsocket.gethostname()')
# Import NumPy, TensorFlow and Horovod, then initialise Horovod.
get_ipython().run_cell_magic('px', '', 'import numpy as np\nimport tensorflow as tf\nimport horovod.tensorflow as hvd')
get_ipython().run_cell_magic('px', '', 'hvd.init()')
# Synthetic linear data: y = ref_slope * x + ref_offset + uniform noise in [-0.5, 0.5).
get_ipython().run_cell_magic('px', '', '# Note that the generated rando data is different from one node to the other\nnsamples = 1000\nref_slope = 2.0\nref_offset = 0.0\nnoise = np.random.random((nsamples, 1)) - 0.5\nx_train = np.random.random((nsamples, 1)) - 0.5\ny_train = ref_slope * x_train + ref_offset + noise')
# Input pipeline: shard the dataset by Horovod rank, batch by 500, repeat 500 times.
get_ipython().run_cell_magic('px', '', '#input pipeline\ndataset = tf.data.Dataset.from_tensor_slices((x_train.astype(np.float32),\n y_train.astype(np.float32)))\ndataset = dataset.shard(hvd.size(), hvd.rank())\ndataset = dataset.batch(500)\ndataset = dataset.repeat(500)\niterator = dataset.make_one_shot_iterator()\nnext_item = iterator.get_next()')
# Model: a slope/offset linear fit trained with gradient descent wrapped in
# Horovod's DistributedOptimizer.
get_ipython().run_cell_magic('px', '', '# Define the model\nslope = tf.Variable(np.random.randn())\noffset = tf.Variable(np.random.randn())\n\nx, y = next_item # The model is the continuation of the pipeline\n\ny_hat = slope * x + offset\n\nloss = tf.losses.mean_squared_error(y_hat, y)\n\nopt = tf.train.GradientDescentOptimizer(.5)\ntrain = hvd.DistributedOptimizer(opt).minimize(loss)')
# Launch an ipcluster process with `ncpus` engines and wait until they are reachable.
# NOTE(review): this excerpt appears to start inside an outer `if`; the second
# `else:` below presumably pairs with that outer condition -- confirm.
if ipcluster == "ipcluster":
p1 = subprocess.Popen("ipcluster start -n {0}".format(ncpus), shell=True, close_fds=(os.name != 'nt'))
else:
# NOTE(review): an argument list combined with shell=True -- on POSIX only the
# first list item is run as the command and the remaining items become
# arguments to the shell itself, so "start -n <ncpus>" is likely dropped.
p1 = subprocess.Popen(shlex.split("{0} start -n {1}".format(ipcluster, ncpus)), shell=True, close_fds=(os.name != 'nt'))
# Poll once per second until a client can connect and reports at least `ncpus`
# entries; a dot is printed for every failed attempt.
while True:
try:
c = ipyparallel.Client()
if len(c) < ncpus:
sys.stdout.write(".")
sys.stdout.flush()
# NOTE(review): `c` is not closed on this path before retrying.
raise ipyparallel.error.TimeoutError
c.close()
break
except (IOError, ipyparallel.error.TimeoutError):
sys.stdout.write(".")
sys.stdout.flush()
time.sleep(1)
else:
# Alternative path: source the given SLURM script, then connect using the
# IPython directory and profile it exports through the environment.
shell_source(slurm_script)
pdir, profile = os.environ['IPPPDIR'], os.environ['IPPPROFILE']
c = Client(ipython_dir=pdir, profile=profile)
ee = c[:]
ne = len(ee)
# NOTE(review): Python 2 print statement; a syntax error under Python 3.
print 'Running on %d engines.' % (ne)
c.close()
sys.stdout.write(" done\n")
sys.stdout.flush()
# Bring up (or attach to) the ipyparallel cluster used for the computation.
# Without a SLURM script, a local `ipcluster` is started with one engine per
# CPU reported by psutil; otherwise the script is sourced and the cluster it
# describes (via IPPPDIR / IPPPROFILE) is contacted.
ncpus = psutil.cpu_count()
if slurm_script is None:
    if ipcluster == "ipcluster":
        p1 = subprocess.Popen("ipcluster start -n {0}".format(ncpus), shell=True, close_fds=(os.name != 'nt'))
    else:
        # An argument list must not be combined with shell=True: on POSIX only
        # the first item would be executed and "start -n <ncpus>" would be
        # handed to the shell itself instead of to the ipcluster command.
        p1 = subprocess.Popen(shlex.split("{0} start -n {1}".format(ipcluster, ncpus)), close_fds=(os.name != 'nt'))
    # Poll once per second until a client connects and sees at least `ncpus`
    # entries, printing dots as progress.
    while True:
        try:
            c = ipyparallel.Client()
            try:
                if len(c) < ncpus:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                    raise ipyparallel.error.TimeoutError
            finally:
                # Always release the probing client, including on retries, so
                # repeated polling does not accumulate open connections.
                c.close()
            break
        except (IOError, ipyparallel.error.TimeoutError):
            sys.stdout.write(".")
            sys.stdout.flush()
            time.sleep(1)
else:
    shell_source(slurm_script)
    pdir, profile = os.environ['IPPPDIR'], os.environ['IPPPROFILE']
    c = Client(ipython_dir=pdir, profile=profile)
    ee = c[:]
    ne = len(ee)
    # Single-argument call form prints identically on Python 2 and Python 3.
    print('Running on %d engines.' % (ne))
    c.close()
# Monitor input: a SUB socket subscribed to every message (empty prefix),
# bound to both the configured monitor URL and an in-process endpoint, then
# wrapped in a ZMQStream driven by `loop`.
sub = ctx.socket(zmq.SUB)
sub.setsockopt(zmq.SUBSCRIBE, b"")
sub.bind(self.monitor_url)
sub.bind('inproc://monitor')
sub = ZMQStream(sub, loop)
# connect the db
# A lower-cased shortcut name is mapped through `_db_shortcuts`; any other
# value is used as given (it is later passed to import_item as a string).
db_class = _db_shortcuts.get(self.db_class.lower(), self.db_class)
self.log.info('Hub using DB backend: %r', (db_class.split('.')[-1]))
self.db = import_item(str(db_class))(session=self.session.session,
parent=self, log=self.log)
# NOTE(review): fixed 0.25 s pause; its purpose is not visible in this excerpt.
time.sleep(.25)
# resubmit stream
# DEALER socket connected to the 'task' client URL.
r = ZMQStream(ctx.socket(zmq.DEALER), loop)
url = util.disambiguate_url(self.client_url('task'))
r.connect(url)
# Assemble the Hub from the streams and settings prepared above; `q` and `n`
# are created outside this excerpt.
self.hub = Hub(loop=loop, session=self.session, monitor=sub, heartmonitor=self.heartmonitor,
query=q, notifier=n, resubmit=r, db=self.db,
engine_info=self.engine_info, client_info=self.client_info,
log=self.log, registration_timeout=self.registration_timeout,
parent=self,
)
# Pattern for a PBS job-array directive line ("#PBS -J <spec>"). Written as a
# raw string so the regex escapes (\W, \w, \d, \-, \$) reach the regex engine
# without relying on Python passing through invalid string escapes.
job_array_regexp = CRegExp(r'#PBS\W+-J\W+[\w\d\-\$]+')
# Job-array template text; empty by default.
job_array_template = Unicode('')
def stop(self):
    """Cancel every job recorded in ``self.job_id`` with ``qdel``.

    ``self.job_id`` holds one or more job ids separated by ";". Each
    non-empty entry is passed to ``qdel`` as an argument list, so no shell
    is involved and the id text cannot be interpreted as shell syntax.
    Whitespace-separated ids inside one entry are passed as separate
    arguments, as the shell would have done.

    Raises:
        subprocess.CalledProcessError: if a ``qdel`` call exits non-zero.
    """
    for job in self.job_id.split(";"):
        ids = job.split()
        if not ids:
            # Empty entry (e.g. trailing ";"): nothing to delete, and a bare
            # "qdel" would only fail.
            continue
        subprocess.check_call(["qdel"] + ids)
def notify_start(self, data):
# Record that the launched process has started: log it, store `data` as the
# start data, mark the launcher as running, keep `data` as the job id, and
# return `data` unchanged to the caller.
self.log.debug('Process %r started: %r', self.args[0], data)
self.start_data = data
self.state = 'running'
# The same value doubles as the job id (stop() splits self.job_id on ";").
self.job_id = data
return data
class BcbioPBSPROEngineSetLauncher(PBSPROLauncher, launcher.BatchClusterAppMixin):
"""Launch Engines using PBSPro"""
batch_file_name = Unicode('pbspro_engines' + str(uuid.uuid4()),
config=True,
help="batch file name for the engine(s) job.")
tag = traitlets.Unicode("", config=True)
cores = traitlets.Integer(1, config=True)
mem = traitlets.Unicode("", config=True)
numengines = traitlets.Integer(1, config=True)
resources = traitlets.Unicode("", config=True)
default_template = Unicode(u"""#!/bin/sh
#PBS -V
#PBS -S /bin/sh
#PBS -N {tag}-e
{resources}
{exports}
# Fill the batch-template context from this launcher's settings, then delegate
# to the parent class's start().
# NOTE(review): this raw "account" value is overwritten a few lines below by
# the formatted "#SBATCH -A" line, so this first assignment has no lasting effect.
self.context["account"] = self.account
self.context["timelimit"] = self.timelimit
self.context["cores"] = self.cores
if self.mem:
# self.mem is parsed as a float, multiplied by 1024 and truncated to an int
# (presumably GB -> MB -- confirm).
self.context["mem"] = "#SBATCH --mem=%s\n" % int(float(self.mem) * 1024.0)
else:
# No memory setting: fall back to four times DEFAULT_MEM_PER_CPU.
self.context["mem"] = "#SBATCH --mem=%d\n" % (4 * DEFAULT_MEM_PER_CPU)
self.context["tag"] = self.tag if self.tag else "bcbio"
# Emit an account directive only when an account is configured.
self.context["account"] = ("#SBATCH -A %s\n" % self.account if self.account else "")
# Each non-blank ";"-separated resource becomes an "#SBATCH --<resource>" line.
# Every item already ends in "\n" and the items are joined with "\n", so
# multiple entries end up separated by an empty line.
self.context["resources"] = "\n".join(["#SBATCH --%s\n" % r.strip()
for r in str(self.resources).split(";")
if r.strip()])
# start(1): presumably a single controller instance -- confirm against the parent class.
return super(BcbioSLURMControllerLauncher, self).start(1)
class BcbioOLDSLURMEngineSetLauncher(SLURMLauncher, launcher.BatchClusterAppMixin):
"""Launch engines using SLURM for version < 2.6"""
# Configurable values substituted into the batch template below:
# `machines` feeds "-N", `account` feeds "-A", `timelimit` feeds "-t".
machines = traitlets.Integer(1, config=True)
account = traitlets.Unicode("", config=True)
timelimit = traitlets.Unicode("", config=True)
# The batch file name gets a random UUID suffix, generated once when the class
# body is executed.
batch_file_name = Unicode("SLURM_engines" + str(uuid.uuid4()),
config=True, help="batch file name for the engine(s) job.")
# Batch script template. The two %s slots are filled immediately with the
# shell-quoted engine command and the timeout parameters; the {name} fields
# are left in place for later substitution.
default_template = Unicode(u"""#!/bin/sh
#SBATCH -A {account}
#SBATCH --job-name ipengine
#SBATCH -N {machines}
#SBATCH -t {timelimit}
export IPYTHONDIR={profile_dir}
srun -N {machines} -n {n} %s %s --profile-dir="{profile_dir}" --cluster-id="{cluster_id}"
""" % (' '.join(map(pipes.quote, engine_cmd_argv)),
' '.join(timeout_params)))
If `timeout` is not ``None`` and the result does not arrive within
`timeout` seconds then ``TimeoutError`` is raised. If the
remote call raised an exception then that exception will be reraised
by get() inside a `RemoteError`.
"""
# Block for up to `timeout` only when the result is not ready yet.
if not self.ready():
self.wait(timeout)
# After waiting: hand back the result on success, re-raise the stored
# exception on failure, or report a timeout if the result is still not ready.
if self._ready:
if self._success:
return self.result()
else:
raise self.exception()
else:
raise error.TimeoutError("Result not ready.")
# NOTE(review): the condition guarding this assignment lies outside this excerpt.
timeout = None
# wait for Future to indicate send having been called,
# which means MessageTracker is ready.
tic = time.time()
if not self._sent_event.wait(timeout):
raise error.TimeoutError("Still waiting to be sent")
# NOTE(review): unreachable -- the raise above always leaves this branch first.
return False
# Deduct the time already spent waiting on the event from the remaining
# budget, never going below zero.
if timeout:
timeout = max(0, timeout - (time.time() - tic))
try:
if timeout is None:
# MessageTracker doesn't like timeout=None
timeout = -1
return self._tracker.wait(timeout)
except zmq.NotDone:
# The tracker did not finish within the timeout; surface it as a TimeoutError.
raise error.TimeoutError("Still waiting to be sent")