if [[
        resource_quantity
        for resource_name, resource_quantity in node["Resources"].items()
        if resource_name != "CPU"
] for node in ray.nodes()] != num_nodes * [[500.0]]:
    raise RuntimeError(
        "The ith node in the cluster should have a custom resource "
        "called 'i' with quantity 500. The nodes are\n%s" % ray.nodes())
for node in ray.nodes():
if ("0" in node["Resources"] and node["ObjectStoreSocketName"] !=
ray.worker.global_worker.plasma_client.store_socket_name):
raise RuntimeError("The node that this driver is connected to "
"must have a custom resource labeled '0'.")
def remaining_processes_alive():
    """See if the remaining processes are alive or not.

    Note that this ignores processes that have been explicitly killed,
    e.g., via a command like node.kill_raylet().

    Returns:
        True if the remaining processes started by ray.init() are alive and
        False otherwise.

    Raises:
        Exception: An exception is raised if the processes were not started by
            ray.init().
    """
    if ray.worker._global_node is None:
        raise Exception("This process is not in a position to determine "
                        "whether all processes are alive or not.")
    return ray.worker._global_node.remaining_processes_alive()
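A short usage sketch in a test, assuming a local Ray instance (the workload itself is elided):

import ray

ray.init()
# ... run some tasks/actors ...
assert remaining_processes_alive(), "a ray.init()-started process died"
ray.shutdown()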
def custom_log(event_type, kind, *args, **kwargs):
orig_log(event_type, kind, *args, **kwargs)
if kind == ray.worker.LOG_SPAN_START:
self.start(event_type)
elif kind == ray.worker.LOG_SPAN_END:
self.end(event_type)
elif kind == ray.worker.LOG_SPAN_POINT:
self.event(event_type)
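A sketch of how a hook like this might be installed; the monkey-patch target `ray.worker.log` and the restore step are assumptions about the old span-logging API, not shown above:

orig_log = ray.worker.log  # assumption: the module-level span logger
ray.worker.log = custom_log
try:
    run_workload()  # hypothetical code under measurement
finally:
    ray.worker.log = orig_log  # always restore the original hook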
def wrap(env):
    # (Wrapper body truncated in this snippet; it decorates `env` before
    # returning it.)
    return env
self.env = wrap(self.env)
def make_env(vector_index):
return wrap(
env_creator(
env_context.copy_with_overrides(
vector_index=vector_index, remote=remote_worker_envs)))
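A sketch of how `make_env` could then populate a vector of sub-environments (`num_envs` is an assumed parameter):

# Each sub-environment gets its own vector_index so per-env seeding and
# logging can tell them apart.
self.envs = [make_env(i) for i in range(num_envs)]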
self.tf_sess = None
policy_dict = _validate_and_canonicalize(policy_graph, self.env)
self.policies_to_train = policies_to_train or list(policy_dict.keys())
if _has_tensorflow_graph(policy_dict):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
and not ray.get_gpu_ids()):
logger.info("Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")
with tf.Graph().as_default():
if tf_session_creator:
self.tf_sess = tf_session_creator()
else:
self.tf_sess = tf.Session(
config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
with self.tf_sess.as_default():
self.policy_map, self.preprocessors = \
self._build_policy_map(policy_dict, policy_config)
else:
self.policy_map, self.preprocessors = self._build_policy_map(
    policy_dict, policy_config)
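For reference, a hypothetical `tf_session_creator` that could be passed in above, capping GPU memory instead of relying on allow_growth (TF1 API):

def tf_session_creator():
    # Reserve at most 30% of GPU memory for this session (illustrative value).
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))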
def __del__(self):
"""Terminate the worker that is running this actor."""
# TODO(swang): Also clean up forked actor handles.
# Kill the worker if this is the original actor handle, created
# with Class.remote(). TODO(rkn): Even without passing handles around,
# this is not the right policy: the actor should stay alive as long as
# there are ANY handles in scope in the process that created the actor,
# not just the first one.
worker = ray.worker.get_global_worker()
exported_in_current_session_and_job = (
self._ray_session_and_job == worker.current_session_and_job)
if (worker.mode == ray.worker.SCRIPT_MODE
and not exported_in_current_session_and_job):
# If the worker is a driver and the driver id has changed because
# Ray was shut down and re-initialized, the actor is already cleaned
# up and we don't need to send `__ray_terminate__` again.
logger.warning(
"Actor is garbage collected in the wrong driver." +
" Actor id = %s, class name = %s.", self._ray_actor_id,
self._ray_class_name)
return
if worker.connected and self._ray_original_handle:
# Note: in py2 the weakref is destroyed prior to calling __del__
# so we need to set the hardref here briefly
try:
self.__ray_terminate__._actor_hard_ref = self
self.__ray_terminate__.remote()
finally:
    # Clear the temporary hard reference so it does not keep the actor
    # alive (reconstructed; the original snippet is truncated here).
    self.__ray_terminate__._actor_hard_ref = None
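A minimal sketch of terminating an actor explicitly instead of waiting for `__del__`, reusing the `__ray_terminate__` method referenced above (`Counter` is a hypothetical actor class):

@ray.remote
class Counter:
    def __init__(self):
        self.n = 0

c = Counter.remote()
c.__ray_terminate__.remote()  # ask the actor's worker process to exit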
worker = ray.worker.global_worker  # reconstructed; the snippet's header is truncated
if isinstance(object_ids, ray.ObjectID):
    object_ids = [object_ids]
if not isinstance(object_ids, list):
raise TypeError("free() expects a list of ObjectID, got {}".format(
type(object_ids)))
# Make sure that the values are object IDs.
for object_id in object_ids:
if not isinstance(object_id, ray.ObjectID):
raise TypeError("Attempting to call `free` on the value {}, "
"which is not an ray.ObjectID.".format(object_id))
unpin_object_data(object_id)
if ray.worker._mode() == ray.worker.LOCAL_MODE:
worker.local_mode_manager.free(object_ids)
return
worker.check_connected()
with profiling.profile("ray.free"):
if len(object_ids) == 0:
return
worker.core_worker.free_objects(object_ids, local_only,
delete_creating_tasks)
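A hedged usage sketch for `free` (the task `f` is illustrative; freeing is an eviction hint to the object stores, not a guaranteed delete):

@ray.remote
def f():
    return 1

x_id = f.remote()
ray.get(x_id)
free([x_id])  # hint the object stores to evict the value for x_id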
def get_next_failed_trial(self):
    """Gets the first trial found to be running on a node presumed dead.

    Returns:
        A Trial object that is ready for failure processing. None if
        no failure detected.
    """
if ray.worker._mode() != ray.worker.LOCAL_MODE:
live_cluster_ips = self.get_alive_node_ips()
if live_cluster_ips - self.get_current_trial_ips():
for trial in self.get_running_trials():
if trial.node_ip and trial.node_ip not in live_cluster_ips:
return trial
return None
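A sketch of how a driver loop might consume this method (`process_trial_failure` is a hypothetical handler name, not taken from the snippet):

trial = executor.get_next_failed_trial()
if trial is not None:
    executor.process_trial_failure(trial)  # hypothetical recovery hook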
if seed is not None:
np.random.seed(seed)
random.seed(seed)
if not hasattr(self.env, "seed"):
raise ValueError("Env doesn't support env.seed(): {}".format(
self.env))
self.env.seed(seed)
try:
import torch
torch.manual_seed(seed)
except ImportError:
logger.info("Could not seed torch")
if _has_tensorflow_graph(policy_dict) and not (tf and
tf.executing_eagerly()):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
and not ray.get_gpu_ids()):
logger.debug("Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")
if not tf:
raise ImportError("Could not import tensorflow")
with tf.Graph().as_default():
if tf_session_creator:
self.tf_sess = tf_session_creator()
else:
self.tf_sess = tf.Session(
config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
with self.tf_sess.as_default():
    # set graph-level seed (TF1 API; call reconstructed, as the
    # snippet is truncated here)
    if seed is not None:
        tf.set_random_seed(seed)
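A tiny sanity check of the seeding approach above: reseeding reproduces the same draws (numpy shown; the random/env/torch cases follow the same pattern):

import numpy as np

np.random.seed(42)
a = np.random.rand(3)
np.random.seed(42)
b = np.random.rand(3)
assert (a == b).all()  # identical seed, identical draws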
def logger_creator(config):
# Set the working dir in the remote process, for user file writes
if not os.path.exists(remote_logdir):
os.makedirs(remote_logdir)
if ray.worker._mode() != ray.worker.LOCAL_MODE:
os.chdir(remote_logdir)
return NoopLogger(config, remote_logdir)
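A hedged usage sketch: Tune-style trainables accept a `logger_creator` callable, so a creator like this can route a remote trial's file writes to `remote_logdir` while suppressing other logging (`MyTrainable` is a hypothetical `tune.Trainable` subclass):

trainable = MyTrainable(config={}, logger_creator=logger_creator)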