Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
os._exit(1)
except zmq.error.Again as e:
logger.warning(
"[Job] Cannot connect to the client. This job will exit and inform the worker."
)
break
socket.close(0)
with self.lock:
self.kill_job_socket.send_multipart(
[remote_constants.KILLJOB_TAG,
to_byte(self.job_address)])
try:
_ = self.kill_job_socket.recv_multipart()
except zmq.error.Again as e:
pass
logger.warning("[Job]lost connection with the client, will exit")
os._exit(1)
def _reply_worker_heartbeat(self, socket):
    """Answer the worker's heartbeat messages until one fails to arrive.

    Every message received on ``socket`` is answered with a single
    ``HEARTBEAT_TAG`` frame. When ``zmq.error.Again`` is raised, a warning
    naming ``self.worker_address`` is logged, the socket is closed and the
    process is terminated with exit status 1.

    Args:
        socket: socket on which the worker's heartbeats are received and
            answered (presumably configured with a receive timeout by the
            caller -- TODO confirm).
    """
    worker_reachable = True
    while worker_reachable:
        try:
            # The payload of the heartbeat is irrelevant; only its arrival matters.
            socket.recv_multipart()
            socket.send_multipart([remote_constants.HEARTBEAT_TAG])
        except zmq.error.Again:
            warning_text = "[Job] Cannot connect to the worker{}. ".format(
                self.worker_address) + "Job will quit."
            logger.warning(warning_text)
            worker_reachable = False
    # close(0): do not linger on unsent messages before exiting.
    socket.close(0)
    # Hard exit of the whole process, bypassing normal interpreter cleanup.
    os._exit(1)
self.start_time = time.time()
thread = threading.Thread(target=self._reply_heartbeat)
thread.setDaemon(True)
thread.start()
self.heartbeat_socket_initialized.wait()
# check if the master is connected properly
try:
self.submit_job_socket.send_multipart([
remote_constants.CLIENT_CONNECT_TAG,
to_byte(self.heartbeat_master_address),
to_byte(socket.gethostname())
])
_ = self.submit_job_socket.recv_multipart()
except zmq.error.Again as e:
logger.warning("[Client] Can not connect to the master, please "
"check if master is started and ensure the input "
"address {} is correct.".format(master_address))
self.master_is_alive = False
raise Exception("Client can not connect to the master, please "
"check if master is started and ensure the input "
"address {} is correct.".format(master_address))
self.heartbeat_master_address = "{}:{}".format(get_ip_address(),
heartbeat_master_port)
self.heartbeat_socket_initialized.set()
while self.client_is_alive and self.master_is_alive:
try:
message = socket.recv_multipart()
elapsed_time = datetime.timedelta(
seconds=int(time.time() - self.start_time))
socket.send_multipart([
remote_constants.HEARTBEAT_TAG,
to_byte(self.executable_path),
to_byte(str(self.actor_num)),
to_byte(str(elapsed_time))
])
except zmq.error.Again as e:
logger.warning("[Client] Cannot connect to the master."
"Please check if it is still alive.")
self.master_is_alive = False
socket.close(0)
logger.warning("Client exit replying heartbeat for master.")
def request_cpu_resource(self, global_client, max_memory):
    """Ask ``global_client`` for a job, retrying up to 300 times.

    Args:
        global_client: object exposing ``submit_job(max_memory)``, which
            returns a job address or ``None`` when nothing is available.
        max_memory: forwarded unchanged to ``submit_job``.

    Returns:
        The first non-``None`` job address, or ``None`` once all 300
        attempts have failed.

    NOTE(review): no delay is applied here between attempts; any pacing
    presumably happens inside ``submit_job`` -- confirm.
    """
    for attempts_left in range(300, 0, -1):
        address = global_client.submit_job(max_memory)
        if address is not None:
            return address
        # Report progress on every 30th attempt only, to avoid log spam.
        if attempts_left % 30 == 0:
            logger.warning(
                "No vacant cpu resources at the moment, "
                "will try {} times later.".format(attempts_left))
    return None
def _kill_job(self, job_address):
"""Kill a job process and update worker information"""
success = self.worker_status.remove_job(job_address)
if success:
while True:
initialized_job = self.job_buffer.get()
initialized_job.worker_address = self.master_heartbeat_address
if initialized_job.is_alive:
self.worker_status.add_job(initialized_job)
if not initialized_job.is_alive: # make sure that the job is still alive.
self.worker_status.remove_job(
initialized_job.job_address)
continue
else:
logger.warning(
"[Worker] a dead job found. The job buffer will not accept this one."
)
if initialized_job.is_alive:
break
self.lock.acquire()
self.request_master_socket.send_multipart([
remote_constants.NEW_JOB_TAG,
cloudpickle.dumps(initialized_job),
to_byte(job_address)
])
_ = self.request_master_socket.recv_multipart()
self.lock.release()
def call(*args, **kwargs):
    """Forward a call to the shared writer, creating the writer on first use.

    The module-level ``_writer`` is built lazily: if it is still ``None``,
    a ``SummaryWriter`` is created on the logger's directory, and when the
    logger has no directory yet one is set via
    ``logger.auto_set_dir(action='d')`` with a warning. The method named by
    ``func_name`` (taken from the enclosing scope) is then invoked on the
    writer with the given arguments, and the writer is flushed.

    Args:
        *args: positional arguments passed through to the writer method.
        **kwargs: keyword arguments passed through to the writer method.
    """
    global _writer
    if _writer is None:
        if logger.get_dir() is None:
            chosen_dir = logger.auto_set_dir(action='d')
            logger.warning(
                "[tensorboard] logdir is None, will save tensorboard files to {}"
                .format(chosen_dir))
        # The directory is re-read from the logger rather than reusing the
        # value obtained above.
        _writer = SummaryWriter(logdir=logger.get_dir())
    getattr(_writer, func_name)(*args, **kwargs)
    # Flush after every call.
    _writer.flush()
3. A new client connects to the master node.
4. A connected client submits a job after a remote object is created.
"""
self.client_socket.linger = 0
self.client_socket.setsockopt(
zmq.RCVTIMEO, remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
while self.master_is_alive:
try:
self._receive_message()
pass
except zmq.error.Again as e:
#detect whether `self.master_is_alive` is True periodically
pass
logger.warning("[Master] Exit master.")
raise DeserializeError
else:
traceback_str = str(traceback.format_exc())
logger.error("traceback:\n{}".format(traceback_str))
reply_socket.send_multipart([
remote_constants.EXCEPTION_TAG,
to_byte(error_str + "\ntraceback:\n" +
traceback_str)
])
break
# receive DELETE_TAG from actor, and stop replying worker heartbeat
elif tag == remote_constants.KILLJOB_TAG:
reply_socket.send_multipart([remote_constants.NORMAL_TAG])
logger.warning("An actor exits and this job {} will exit.".
format(job_address))
break
else:
logger.error(
"The job receives an unknown message: {}".format(message))
raise NotImplementedError